namespace threadblock {

layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;
  static int const kAccessSizeInBits = 128;

  static_assert(kAccessSizeInBits == sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess,
      "This iterator requires a policy whose access size is 128 bits");

  static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
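  // When the thread map performs more than one strided iteration, two base pointers are
  // kept; the load/store loops below select between them with (s & 1), so even and odd
  // strided iterations use different precomputed base addresses.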
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

  AccessType *pointer_[Detail::kPointerCount];
  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    for (int i = 0; i < Detail::kPointerCount; ++i) {

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    byte_offset_ += pointer_offset * sizeof(Element);

    add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
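    // operator++ advances by one whole tile: Shape::kStrided * stride_ * Layout::kElementsPerAccess
    // elements along the strided rank, or Shape::kContiguous elements along the contiguous rank.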
    add_pointer_offset(
        coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
        coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
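    // In the loop below, pointer_[s & 1] picks the even/odd base pointer while
    // stride_idx = (s & ~1) supplies the stride term shared by both members of a pair
    // of strided iterations.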
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);

        frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
      }
    }

    load_with_pointer_offset(frag, 0);
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);

        *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) = frag_ptr[access_idx];
      }
    }

    store_with_pointer_offset(frag, 0);
layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 0 : 1),

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
  ): iterator_({ref.data(), ref.stride()}, thread_id) {

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.row(), coord.column()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    load_with_pointer_offset(frag, 0);

  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    iterator_.store_with_pointer_offset(frag, pointer_offset);

    store_with_pointer_offset(frag, 0);
layout::RowMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 1 : 0),

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
  ): iterator_({ref.data(), ref.stride()}, thread_id) {

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.column(), coord.row()});
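    // The row-major adapter forwards to the pitch-linear iterator with coordinates
    // transposed: columns map to the contiguous rank and rows to the strided rank.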
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    load_with_pointer_offset(frag, 0);

  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    iterator_.store_with_pointer_offset(frag, pointer_offset);

    store_with_pointer_offset(frag, 0);
layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;
  static int const kAccessSizeInBits = 128;

  static_assert(kAccessSizeInBits == sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess,
      "This iterator requires a policy whose access size is 128 bits");

  static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

  AccessType *pointer_[Detail::kPointerCount];
  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    for (int i = 0; i < Detail::kPointerCount; ++i) {

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));

    byte_offset_ += pointer_offset * sizeof(Element);

    add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
    add_pointer_offset(
        coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
        coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);

        frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
      }
    }

    load_with_pointer_offset(frag, 0);
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);

        *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) = frag_ptr[access_idx];
      }
    }

    store_with_pointer_offset(frag, 0);
layout::ColumnMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 0 : 1),

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
  ): iterator_({ref.data(), ref.stride()}, thread_id) {

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.row(), coord.column()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    load_with_pointer_offset(frag, 0);

  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    iterator_.store_with_pointer_offset(frag, pointer_offset);

    store_with_pointer_offset(frag, 0);
layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 1 : 0),

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
  ): iterator_({ref.data(), ref.stride()}, thread_id) {

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.column(), coord.row()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    load_with_pointer_offset(frag, 0);

  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    iterator_.store_with_pointer_offset(frag, pointer_offset);

    store_with_pointer_offset(frag, 0);
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<Shape_, Element_,
                          layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                                     Shape_::kContiguous>,
                          AdvanceRank, ThreadMap_, Alignment> {
  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Layout = layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                            Shape::kContiguous>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);

  static int const kIterarionsPerAccess =
      ThreadMap::kElementsPerAccess / Layout::kElementsPerAccess;

  static int const kContiguousElementsPerLine = 4;
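  // kIterarionsPerAccess is the number of Layout-sized vector accesses needed to cover one
  // thread-map access; kContiguousElementsPerLine is the number of contiguous elements
  // grouped on a single line of the crosswise arrangement. Both appear in the offset
  // arithmetic of the load/store loops below.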
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  using Fragment =
      Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;

  AccessType *pointer_[Detail::kPointerCount];
      : line_size(ref.stride(0) * Detail::kContiguousElementsPerLine /
                  Layout::kElementsPerAccess),
        byte_offset_(0) {

    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    for (int i = 0; i < Detail::kPointerCount; ++i) {

      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    byte_offset_ += pointer_offset * sizeof(Element);

    add_pointer_offset(Shape::kContiguous * line_size);
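    // operator++ advances the pointer by Shape::kContiguous * line_size elements, i.e. one
    // tile step in this crosswise arrangement.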
    add_pointer_offset((coord.contiguous() * (Shape::kContiguous / Layout::kElementsPerAccess) *
                            line_size +
                        coord.strided() * Shape::kStrided) *
                       Layout::kElementsPerAccess);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      AccessType *access_ptr = pointer_[(s & 1) ^ (s / 2)];

      access_ptr += 16 * (s / 2);
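      // The index (s & 1) ^ (s / 2) alternates between the two base pointers and swaps
      // their order on every other pair of strided iterations, while += 16 * (s / 2)
      // advances the selected pointer by 16 access-sized elements per completed pair.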
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        for (int i = 0; i < Detail::kIterarionsPerAccess; ++i) {

          int access_offset =
              c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
              vec_pointer_offset + i * line_size;

          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
                               Detail::kIterarionsPerAccess + i;

          char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);

          frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
              access_byte_ptr + byte_offset_);
    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      AccessType *access_ptr = pointer_[(s & 1) ^ ((s >> 1) & 1)];

      access_ptr += 16 * (s / 2);
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        for (int i = 0; i < Detail::kIterarionsPerAccess; ++i) {

          int access_offset =
              c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
              vec_pointer_offset + i * line_size;

          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
                               Detail::kIterarionsPerAccess + i;

          char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);

          *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) =
              frag_ptr[access_idx];
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<Shape_, Element_,
                          layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Shape_::kRow>,
                          AdvanceRank, ThreadMap_, Alignment> {
  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.row(), coord.column()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    iterator_.store_with_pointer_offset(frag, pointer_offset);
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<Shape_, Element_,
                          layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Shape_::kColumn>,
                          AdvanceRank, ThreadMap_, Alignment> {
  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.column(), coord.row()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    iterator_.store_with_pointer_offset(frag, pointer_offset);
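All of the pitch-linear specializations above share the same addressing pattern: a small set of
precomputed base pointers plus an integer access_offset computed from the strided and contiguous
iteration indices. The following standalone host-side sketch reproduces that arithmetic with
made-up stand-in constants for ThreadMap::Iterations, ThreadMap::Delta, and the stride; it is only
an illustration of the indexing scheme, not part of the CUTLASS iterator.

// Minimal standalone sketch (assumed, illustrative constants only).
#include <cstdio>

int main() {
  // Stand-ins for ThreadMap::Iterations, ThreadMap::Delta, kElementsPerAccess, and stride_.
  int const kIterationsStrided    = 4;
  int const kIterationsContiguous = 2;
  int const kDeltaStrided         = 2;
  int const kDeltaContiguous      = 16;
  int const kElementsPerAccess    = 8;
  int const stride                = 8;   // hypothetical stride, in AccessType units

  // Stand-ins for pointer_[0] and pointer_[1] (offsets into one buffer).
  int base[2] = {0, stride};

  for (int s = 0; s < kIterationsStrided; ++s) {
    int stride_idx = (s & ~1);            // stride term shared by an even/odd pair
    for (int c = 0; c < kIterationsContiguous; ++c) {
      int access_offset = stride_idx * kDeltaStrided * stride +
                          c * kDeltaContiguous / kElementsPerAccess;
      // Even/odd strided iterations select different base pointers via (s & 1).
      std::printf("s=%d c=%d -> pointer_[%d] + %d\n",
                  s, c, s & 1, base[s & 1] + access_offset);
    }
  }
  return 0;
}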