cutlass/mma__tensor__op__tile__iterator_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"

 #include "cutlass/array.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/tensor_ref.h"
 #include "cutlass/matrix_shape.h"

 #include "cutlass/arch/memory_sm75.h"
 #include "cutlass/gemm/gemm.h"

 #include "cutlass/layout/matrix.h"
 #include "cutlass/layout/tensor.h"
 #include "cutlass/layout/pitch_linear.h"
 #include "cutlass/layout/tensor_op_multiplicand_sm75.h"

 #include "cutlass/platform/platform.h"
 #include "cutlass/fast_math.h"


 namespace cutlass {
 namespace gemm {
 namespace warp {


 template <
     typename Shape_,
     Operand Operand,
     typename Element_,
     typename Layout_,
     typename InstructionShape_,
     int OpDelta_,
     int Threads,
     int PartitionsK_ = 1>
 class MmaTensorOpMultiplicandTileIterator;


 template <
     typename Shape_,
     Operand Operand_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_,
     int PartitionsK_>
 class MmaTensorOpMultiplicandTileIterator<
     Shape_, Operand_, Element_,
     cutlass::layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                                    64>,
     InstructionShape_, OpDelta_, 32, PartitionsK_> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand_;

   static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
     "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");

   using Element = Element_;

   using Layout = cutlass::layout::TensorOpMultiplicandCongruous<
       sizeof_bits<Element_>::value, 64>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   static int const kPartitionsK = PartitionsK_;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   struct Policy {
     static_assert(
         !(Shape::kContiguous % InstructionShape::kContiguous),
         "Shape of warp-level Mma must be divisible by operator shape.");

     // Determine number of elements along outer dimension per individual LDSM op
     static int const kLdsmOpOuter = Layout::kElementsPerAccess;
     static int const kLdsmOpInner = 8;

     static_assert(!(Shape::kContiguous % kLdsmOpOuter),
       "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");

     static_assert(!(Shape::kStrided % kLdsmOpInner),
       "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");

     static int const LdsmShapeStrided =
         InstructionShape::kStrided / kLdsmOpInner;
     static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
     using LdsmShape =
         layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;

     using LdsmIterations = layout::PitchLinearShape<
         Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
         1>;

     static int const kGroupsPerTile =
         Shape::kStrided / InstructionShape::kStrided;
   };

 private:

   static_assert(kOpDelta == 1,
     "Alternative arrangements not supported at present.");

   static int const kPointerCount =
       Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous;

   using AccessType = Array<Element, Layout::kElementsPerAccess>;

   int k_group_idx_;

 public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:

   Index stride_;

   AccessType const *pointer_[kPointerCount];

   Index byte_offset_;

 public:

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator(
     TensorRef const &ref,
     int lane_id
   ):
     stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0),
     k_group_idx_(0) {

     int quad_pair = (lane_id >> 3);
     int lane_in_quad = (lane_id & 3);
     int lane_in_quad_pair = (lane_id & 7);
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < kPointerCount; ++i) {
       int partition_contiguous_idx = -1;
       int access_contiguous_idx = -1;
       int access_strided_idx = -1;

       if (Policy::LdsmShape::kContiguous == 4) {
         partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ i);
         access_contiguous_idx = (quad_pair ^ lane_in_quad);
         access_strided_idx = lane_in_quad_pair;
       }
       int access_contiguous =
           partition_contiguous_idx * Layout::PartitionShape::kContiguous +
           access_contiguous_idx;

       int access_strided = access_strided_idx;

       pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
                     access_contiguous + access_strided * stride_;
     }
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

     byte_offset_ += offset * sizeof(Element);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     int contiguous_offset = tile_offset.contiguous();
     if (Shape::kContiguous ==
         Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
       if (tile_offset.contiguous() % 2) {
         CUTLASS_PRAGMA_UNROLL
         for (int i = 0; i < kPointerCount / 2; ++i) {
           AccessType const *tmp_pointer = pointer_[i];
           pointer_[i] = pointer_[i + kPointerCount / 2];
           pointer_[i + kPointerCount / 2] = tmp_pointer;
         }
       }
       contiguous_offset = (tile_offset.contiguous() >> 1) << 1;
     }

     int offset = (tile_offset.strided() * InstructionShape::kStrided) *
                      stride_ * Layout::kElementsPerAccess +
                  contiguous_offset * Shape::kContiguous;

     add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator++() {

     add_tile_offset({0, 1});

     if (kPartitionsK > 1) {
       ++k_group_idx_;
       // Jump to next stage
       if (k_group_idx_ == Policy::kGroupsPerTile) {
         k_group_idx_ = 0;
         add_tile_offset(
             {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
       }
     }

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator--() {
     byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
                     Layout::kElementsPerAccess;

     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(tile_offset);
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-tile_offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {

     load_with_byte_offset(frag, 0);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {

     Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
       reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);

     CUTLASS_PRAGMA_UNROLL
     for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {

       CUTLASS_PRAGMA_UNROLL
       for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {

         int access_idx = c + s * Policy::LdsmIterations::kContiguous;

         AccessType const *source_ptr =
             pointer_[c % kPointerCount] +
             Layout::TileShape::kContiguous * (c / kPointerCount) +
             Policy::LdsmShape::kStrided * s * stride_;

         char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;

         cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
           fetch_ptr[access_idx],
           source_byte_ptr
         );
       }
     }
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     load_with_byte_offset(frag, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     load_with_byte_offset(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     Index pointer_offset =
       tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess +
       tile_offset.strided() * InstructionShape::kStrided * stride_;

     byte_offset += sizeof(AccessType) * pointer_offset;

     load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     // no op
   }
 };

 template <
     typename Shape_,
     Operand Operand_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_,
     int PartitionsK_>
 class MmaTensorOpMultiplicandTileIterator<
     Shape_, Operand_, Element_,
     cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
         sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
     InstructionShape_, OpDelta_, 32, PartitionsK_> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand_;

   static_assert(kOperand == Operand::kA,
                 "MmaTensorOpMultiplicandIterator for ColumnMajor Congruous may "
                 "only be instantiated for A operand to warp-level Mma.");

   using Element = Element_;

   using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
       sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   using Base = MmaTensorOpMultiplicandTileIterator<
       layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
       layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                             int(128 / sizeof(Element_))>,
       layout::PitchLinearShape<InstructionShape::kRow,
                                InstructionShape::kColumn>,
       kOpDelta, kThreads, PartitionsK_>;

  public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:

   Base iterator_;

 public:

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator() { }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator(
     TensorRef const &ref,
     int lane_id
   ): iterator_({ref.data(), ref.stride()}, lane_id) {
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

     iterator_.add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator++() {

     ++iterator_;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator--() {

     --iterator_;

     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {

     iterator_.load(frag);
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     iterator_.load_with_pointer_offset(frag, pointer_offset);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     // TODO
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     // TODO
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(
       frag,
       {tile_offset.contiguous(), tile_offset.strided()},
       byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     iterator_.set_kgroup_index(k_group);
   }
 };


 template <
     typename Shape_,
     Operand Operand_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_,
     int PartitionsK_>
 class MmaTensorOpMultiplicandTileIterator<
     Shape_, Operand_, Element_,
     cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
         sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
     InstructionShape_, OpDelta_, 32, PartitionsK_> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand_;

   static_assert(kOperand == Operand::kB,
                 "MmaTensorOpMultiplicandIterator for RowMajor Congruous may "
                 "only be instantiated for B operand to warp-level Mma.");

   using Element = Element_;

   using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
       sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   using Base = MmaTensorOpMultiplicandTileIterator<
       layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
       layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                             int(128 / sizeof(Element_))>,
       layout::PitchLinearShape<InstructionShape::kColumn,
                                InstructionShape::kRow>,
       kOpDelta, kThreads, PartitionsK_>;

  public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:

   Base iterator_;

 public:

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator() { }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator(
     TensorRef const &ref,
     int lane_id
   ): iterator_({ref.data(), ref.stride()}, lane_id) {
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

     iterator_.add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator++() {

     ++iterator_;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator--() {

     --iterator_;

     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {

     iterator_.load(frag);
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     iterator_.load_with_pointer_offset(frag, pointer_offset);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     // TODO
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     // TODO
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(
       frag,
       {tile_offset.strided(), tile_offset.contiguous()},
       byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     iterator_.set_kgroup_index(k_group);
   }
 };


 template <
     typename Shape_,
     Operand Operand_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_,
     int Crosswise,
     int PartitionsK_>
 class MmaTensorOpMultiplicandTileIterator<
     Shape_, Operand_, Element_,
     cutlass::layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                    Crosswise>,
     InstructionShape_, OpDelta_, 32, PartitionsK_> {
  public:
   using Shape = Shape_;

   static Operand const kOperand = Operand_;

   static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                 "MmaTensorOpMultiplicandIterator may only be instantiated for "
                 "A or B operands to warp-level Mma.");

   using Element = Element_;

   static int const kCrosswise = Crosswise;

   using Layout = cutlass::layout::TensorOpMultiplicandCrosswise<
       sizeof_bits<Element_>::value, kCrosswise>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   static int const kPartitionsK = PartitionsK_;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   struct Policy {
     static_assert(
         !(Shape::kContiguous % InstructionShape::kContiguous),
         "Shape of warp-level Mma must be divisible by operator shape.");

     // Determine number of elements along outer dimension per individual LDSM op
     static int const kLdsmOpOuter = Layout::kElementsPerAccess;
     static int const kLdsmOpInner = 8;

     static_assert(!(Shape::kContiguous % kLdsmOpOuter),
                   "Shape of warp-level mma must be divisible by LDSM's "
                   "fundamental tile size.");

     static_assert(!(Shape::kStrided % kLdsmOpInner),
                   "Shape of warp-level mma must be divisible by LDSM's "
                   "fundamental tile size.");

     static int const LdsmShapeContiguous =
         InstructionShape::kContiguous / kLdsmOpOuter;
     static int const LdsmShapeStrided =
         ((4 / LdsmShapeContiguous * kLdsmOpInner) > Shape::kStrided)
             ? (Shape::kStrided / kLdsmOpInner)
             : (4 / LdsmShapeContiguous);
     using LdsmShape =
         layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;

     using LdsmIterations =
         layout::PitchLinearShape<1, Shape::kStrided / kLdsmOpInner /
                                         LdsmShape::kStrided>;

     static int const kGroupsPerTile = Layout::TileShape::kContiguous /
                                       Layout::kFactor / LdsmShape::kContiguous;
   };

  private:
   static_assert(kOpDelta == 1,
                 "Alternative arrangements not supported at present.");

   using AccessType = Array<Element, Layout::kElementsPerAccess>;

  public:
   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads>;

  private:

   int sections_;

   Index stride_;

   AccessType const *pointer_;

   Index byte_offset_;

   int k_group_idx_;

  public:
   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator()
       : pointer_(nullptr),
         sections_(0),
         stride_(0),
         byte_offset_(0),
         k_group_idx_(0) {}

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
       : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
         sections_(ref.stride(0) / kCrosswise),
         // stride_ = kCrosswise x sections_ x kFactor
         stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
         byte_offset_(0),
         k_group_idx_(0) {
     // Warp level iterator at most use double buffer to hide latency.  If there
     // are more than 2 sections, every stage should have more than 1 section.
     // TODO: refactor code after every case is implemented

     // Turing silicon requires all 32 threads in a warp provide valid addresses
     // even for LDSM.1 and LDSM.2
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 750))
     lane_id = lane_id % (Policy::LdsmShape::kCount * Policy::kLdsmOpInner);
 #endif

     int lane_in_pair = (lane_id & 1);
     int lane_in_quad = (lane_id & 3);
     int lane_in_quad_pair = (lane_id & 7);
     int lane_in_quad_quad = (lane_id & 15);

     int partition_contiguous_idx = -1;
     int access_contiguous_idx = -1;
     int access_strided_idx = -1;

     if (Layout::kFactor == 4) {
       // Super Integer matrix multiply Interleaved-32

       int factor_in_partition =
           (Layout::PartitionShape::kContiguous * Layout::kFactor /
            Layout::TileShape::kContiguous);

       if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
         // Integer matrix multiply 8816  A/B
         partition_contiguous_idx = lane_in_quad / factor_in_partition;
         access_contiguous_idx = ((lane_in_pair * factor_in_partition) ^
                                  (lane_in_quad_quad / Layout::kFactor));
         access_strided_idx = lane_id / Layout::kFactor;
       }
     } else if (Layout::kFactor == 2) {
       // Super Matrix multiply kBlock = 32
       if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
         // (Q stands for 1 8x128bit block).
         // Q0
         // Q1
         // Q2
         // Q3
         // Four blocks are next to each other in the strided dimension.
         partition_contiguous_idx = (lane_id % Layout::kFactor);
         access_contiguous_idx = (lane_in_quad_pair / Layout::kFactor);
         access_strided_idx = lane_id / Layout::kFactor;
       }
     } else if (Layout::kFactor == 1) {
       // Super Matrix multiply kBlock = 64
       if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
         // Q0
         // Q1
         // Q2
         // Q3
         partition_contiguous_idx = (lane_in_quad_pair >> 2);
         access_contiguous_idx = lane_in_quad;
         access_strided_idx = lane_id;
       }
     }

     int access_contiguous =
         partition_contiguous_idx * Layout::PartitionShape::kContiguous +
         access_contiguous_idx;

     int access_strided = access_strided_idx;

     byte_offset_ = (access_contiguous + access_strided * stride_) *
                    sizeof_bits<Element>::value * Layout::kElementsPerAccess / 8;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
     byte_offset_ += offset * sizeof_bits<Element>::value / 8;

     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_tile_offset(
       TensorCoord const &tile_offset) {
     int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile;
     int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile;

     byte_offset_ ^= k_groups_delta * sizeof_bits<Element>::value *
                     Layout::kElementsPerAccess / 8;
     pointer_ +=
         tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor +
         whole_tiles * stride_ / sections_;
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator++() {

     // Integer matrix multiply 8816  Interleaved-32
     //   ^1 ^1
     // Matrix multiply 1688 kblock=32 || Integer matrix multiply 8816 kblock=64
     //   ^1 ^3 ^1 ^3
     // Matrix multiply 1688 kblock=64
     //   ^1 ^3 ^1 ^7 ^1 ^3 ^1 ^7
     if ((Policy::kGroupsPerTile / kPartitionsK) > 1) {
       int mask = ((Policy::kGroupsPerTile / kPartitionsK) == 8)
                      ? 3
                      : (((Policy::kGroupsPerTile / kPartitionsK) == 4) ? 1 : 0);

       if (((k_group_idx_ & mask) % 2) == 0)
         byte_offset_ ^= 1 * Policy::LdsmShape::kContiguous *
                         sizeof_bits<Element>::value *
                         Layout::kElementsPerAccess / 8;
       else if ((k_group_idx_ & mask) == 1)
         byte_offset_ ^= 3 * Policy::LdsmShape::kContiguous *
                         sizeof_bits<Element>::value *
                         Layout::kElementsPerAccess / 8;
       else if ((k_group_idx_ & mask) == 3)
         byte_offset_ ^= 7 * Policy::LdsmShape::kContiguous *
                         sizeof_bits<Element>::value *
                         Layout::kElementsPerAccess / 8;
     }

     k_group_idx_++;

     if (k_group_idx_ == (Policy::kGroupsPerTile / kPartitionsK)) {
       k_group_idx_ = 0;
       add_tile_offset({Policy::kGroupsPerTile, 0});
     }

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator--() { assert(0); }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator+=(
       TensorCoord const &tile_offset) {
     add_tile_offset(tile_offset);
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator-=(
       TensorCoord const &tile_offset) {
     add_tile_offset(-tile_offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {
     Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
         reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);

     CUTLASS_PRAGMA_UNROLL
     for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
       CUTLASS_PRAGMA_UNROLL
       for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
         int access_idx = c + s * Policy::LdsmIterations::kContiguous;

         AccessType const *source_ptr =
             pointer_ + Policy::LdsmShape::kContiguous * c +
             Policy::kLdsmOpInner / Layout::kFactor *
                 Policy::LdsmShape::kStrided * s * stride_;

         char const *source_byte_ptr =
             reinterpret_cast<char const *>(source_ptr) + byte_offset +
             byte_offset_;

         cutlass::arch::ldsm<layout::RowMajor, Policy::LdsmShape::kCount>(
             fetch_ptr[access_idx], source_byte_ptr);
       }
     }
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     load_with_byte_offset(frag, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     load_with_byte_offset(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     Index pointer_offset = tile_offset.contiguous() *
                                InstructionShape::kContiguous /
                                Layout::kElementsPerAccess +
                            tile_offset.strided() * Shape::kStrided * stride_;

     byte_offset += sizeof_bits<AccessType>::value * pointer_offset / 8;

     load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     k_group_idx_ = k_group % (Policy::kGroupsPerTile / kPartitionsK);
   }
 };


 template <
     typename Shape_,
     Operand Operand_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_,
     int Crosswise,
     int PartitionsK_>
 class MmaTensorOpMultiplicandTileIterator<
     Shape_, Operand_, Element_,
     cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
         sizeof_bits<Element_>::value, Crosswise>,
     InstructionShape_, OpDelta_, 32, PartitionsK_> {
  public:
   using Shape = Shape_;

   static Operand const kOperand = Operand_;

   static_assert(kOperand == Operand::kB,
                 "MmaTensorOpMultiplicandIterator for ColumnMajor Crosswise may "
                 "only be instantiated for B operand to warp-level Mma.");

   using Element = Element_;

   static int const kCrosswise = Crosswise;

   using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
       sizeof_bits<Element_>::value, kCrosswise>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   using Base = MmaTensorOpMultiplicandTileIterator<
       layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
       layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                             kCrosswise>,
       layout::PitchLinearShape<InstructionShape::kRow,
                                InstructionShape::kColumn>,
       kOpDelta, kThreads, PartitionsK_>;

  public:
   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads>;

  private:
   Base iterator_;

  public:
   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator() {}

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
       : iterator_({ref.data(), ref.stride()}, lane_id) {}

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
     iterator_.add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_tile_offset(
       TensorCoord const &tile_offset) {
     iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator++() {
     ++iterator_;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator--() {
     --iterator_;

     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator+=(
       TensorCoord const &tile_offset) {
     add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator-=(
       TensorCoord const &tile_offset) {
     add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const { iterator_.load(frag); }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     iterator_.load_with_pointer_offset(frag, pointer_offset);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     // TODO
     assert(0);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     // TODO
     assert(0);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(
         frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     iterator_.set_kgroup_index(k_group);
   }
 };


 template <
     typename Shape_,
     Operand Operand_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_,
     int Crosswise,
     int PartitionsK_>
 class MmaTensorOpMultiplicandTileIterator<
     Shape_, Operand_, Element_,
     cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
         sizeof_bits<Element_>::value, Crosswise>,
     InstructionShape_, OpDelta_, 32, PartitionsK_> {
  public:
   using Shape = Shape_;

   static Operand const kOperand = Operand_;

   static_assert(kOperand == Operand::kA,
                 "MmaTensorOpMultiplicandIterator for RowMajor Crosswise may "
                 "only be instantiated for A operand to warp-level Mma.");

   using Element = Element_;

   static int const kCrosswise = Crosswise;

   using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
       sizeof_bits<Element_>::value, kCrosswise>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   using Base = MmaTensorOpMultiplicandTileIterator<
       layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
       layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                             kCrosswise>,
       layout::PitchLinearShape<InstructionShape::kColumn,
                                InstructionShape::kRow>,
       kOpDelta, kThreads, PartitionsK_>;

  public:
   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads>;

  private:
   Base iterator_;

  public:
   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator() {}

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
       : iterator_({ref.data(), ref.stride()}, lane_id) {}

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
     iterator_.add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &add_tile_offset(
       TensorCoord const &tile_offset) {
     iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator++() {
     ++iterator_;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator--() {
     --iterator_;

     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator+=(
       TensorCoord const &tile_offset) {
     add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpMultiplicandTileIterator &operator-=(
       TensorCoord const &tile_offset) {
     add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const { iterator_.load(frag); }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     iterator_.load_with_pointer_offset(frag, pointer_offset);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     // TODO
     assert(0);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     // TODO
     assert(0);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(
         frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     iterator_.set_kgroup_index(k_group);
   }
 };

 template <
     typename Shape_,
     typename Element_,
     typename Layout_,
     typename InstructionShape_,
     typename OpDelta_>
 class MmaTensorOpAccumulatorTileIterator;


 template <
     typename Shape_,
     typename Element_,
     typename InstructionShape_,
     typename OpDelta_>
 class MmaTensorOpAccumulatorTileIterator<
     Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand::kC;

   using Element = Element_;

   using Layout = cutlass::layout::RowMajor;

   using InstructionShape = InstructionShape_;

   using OpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   struct Policy {
     static_assert(
         !(Shape::kRow % InstructionShape::kM) &&
             !(Shape::kColumn % InstructionShape::kN),
         "Shape of warp-level Mma must be divisible by operator shape.");

     static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
       "Layouts must be defined for logical MatrixCoord coordinate space.");

     using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
                                       Shape::kColumn / InstructionShape::kN>;
   };

 private:

   // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
   // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
   // of that row. The accumulators within one row are assumed to be consecutive.
  static int const kElementsPerAccess = InstructionShape::kN / 4;
  static int const kRowsPerTile = 8;
  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;

 public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:

   TensorRef ref_;

 public:

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator() { }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator(
     TensorRef const &ref,
     int lane_id
   ):
     ref_(ref) {

     int quad = (lane_id >> 2);
     int lane_in_quad = (lane_id & 3);

     MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);

     ref_.add_coord_offset(lane_offset);
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
     ref_.add_pointer_offset(offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator++() {
     // deliberate no-op
     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator--() {
     // deliberate no-op
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(tile_offset);
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-tile_offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {
     load_with_pointer_offset(frag, 0);
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
     Fragment &frag,
     Index pointer_offset) const {

     TensorRef offset_ref(ref_);
     offset_ref.add_pointer_offset(pointer_offset);

     CUTLASS_PRAGMA_UNROLL
     for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
       CUTLASS_PRAGMA_UNROLL
       for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

         int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
           (mma_n * Policy::MmaIterations::kRow + mma_m);

         CUTLASS_PRAGMA_UNROLL
         for (int row = 0; row < kAccumulatorRows; ++row) {
           CUTLASS_PRAGMA_UNROLL
           for (int col = 0; col < kElementsPerAccess; ++col) {
             int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                           row * kRowsPerTile;
             int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;

             frag[mma_accum_start + row * kElementsPerAccess + col] = offset_ref.at({accum_m, accum_n});
           }
         }
       }
     }
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
     Fragment &frag,
     Index byte_offset) const {

     load_with_pointer_offset(byte_offset / sizeof(Element));
   }

   CUTLASS_DEVICE
   void load(
     Fragment &frag,
     TensorCoord const &tile_offset) const {

     load(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void load(
     Fragment &frag,
     TensorCoord const &tile_offset,
     Index pointer_offset) const {

     load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
   }

   CUTLASS_HOST_DEVICE
   void store(Fragment const &frag) const {
     store_with_pointer_offset(frag, 0);
   }

   CUTLASS_DEVICE
   void store_with_pointer_offset(
     Fragment const &frag,
     Index pointer_offset) const {

     TensorRef offset_ref(ref_);
     offset_ref.add_pointer_offset(pointer_offset);

     CUTLASS_PRAGMA_UNROLL
     for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
       CUTLASS_PRAGMA_UNROLL
       for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

         int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
           (mma_n * Policy::MmaIterations::kRow + mma_m);

         CUTLASS_PRAGMA_UNROLL
         for (int row = 0; row < kAccumulatorRows; ++row) {
           CUTLASS_PRAGMA_UNROLL
           for (int col = 0; col < kElementsPerAccess; ++col) {
             int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                           row * kRowsPerTile;
             int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
             int idx = mma_accum_start + row * kElementsPerAccess + col;

             offset_ref.at({accum_m, accum_n}) = frag[idx];
           }
         }
       }
     }
   }

   CUTLASS_DEVICE
   void store_with_byte_offset(
     Fragment const &frag,
     Index byte_offset) const {

     store_with_pointer_offset(byte_offset / sizeof(Element));
   }

   CUTLASS_DEVICE
   void store(
     Fragment &frag,
     TensorCoord const &tile_offset) const {

     store(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void store(
       Fragment const &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
   }
 };


 template <
     typename Shape_,
     typename Element_,
     typename InstructionShape_,
     typename OpDelta_>
 class MmaTensorOpAccumulatorTileIterator<Shape_, Element_,
                                          cutlass::layout::ColumnMajor,
                                          InstructionShape_, OpDelta_> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand::kC;

   using Element = Element_;

   using Layout = cutlass::layout::ColumnMajor;

   using InstructionShape = InstructionShape_;

   using OpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   struct Policy {
     static_assert(
         !(Shape::kRow % InstructionShape::kM) &&
             !(Shape::kColumn % InstructionShape::kN),
         "Shape of warp-level Mma must be divisible by operator shape.");

     static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
       "Layouts must be defined for logical MatrixCoord coordinate space.");

     using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
                                       Shape::kColumn / InstructionShape::kN>;
   };

 private:

   // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
   // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
   // of that row. The accumulators within one row are assumed to be consecutive.
  static int const kElementsPerAccess = InstructionShape::kN / 4;
  static int const kRowsPerTile = 8;
  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;

 public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:

   TensorRef ref_;

 public:

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator() { }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator(
     TensorRef const &ref,
     int lane_id
   ):
     ref_(ref) {

     int quad = (lane_id >> 2);
     int lane_in_quad = (lane_id & 3);

     MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);

     ref_.add_coord_offset(lane_offset);
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
     ref_.add_pointer_offset(offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator++() {
     // deliberate no-op
     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator--() {
     // deliberate no-op
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(tile_offset);
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-tile_offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {
     load_with_pointer_offset(frag, 0);
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
     Fragment &frag,
     Index pointer_offset) const {

     TensorRef offset_ref(ref_);
     offset_ref.add_pointer_offset(pointer_offset);

     CUTLASS_PRAGMA_UNROLL
     for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
       CUTLASS_PRAGMA_UNROLL
       for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

         int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
           (mma_n * Policy::MmaIterations::kRow + mma_m);

         CUTLASS_PRAGMA_UNROLL
         for (int row = 0; row < kAccumulatorRows; ++row) {
           CUTLASS_PRAGMA_UNROLL
           for (int col = 0; col < kElementsPerAccess; ++col) {
             int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                           row * kRowsPerTile;
             int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
             int idx = mma_accum_start + row * kElementsPerAccess + col;

             frag[idx] = offset_ref.at({accum_m, accum_n});
           }
         }
       }
     }
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
     Fragment &frag,
     Index byte_offset) const {

     load_with_pointer_offset(byte_offset / sizeof(Element));
   }

   CUTLASS_DEVICE
   void load(
     Fragment &frag,
     TensorCoord const &tile_offset) const {

     load(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void load(
     Fragment &frag,
     TensorCoord const &tile_offset,
     Index pointer_offset) const {

     load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
   }

   CUTLASS_HOST_DEVICE
   void store(Fragment const &frag) const {
     store_with_pointer_offset(frag, 0);
   }

   CUTLASS_DEVICE
   void store_with_pointer_offset(
     Fragment const &frag,
     Index pointer_offset) const {

     TensorRef offset_ref(ref_);
     offset_ref.add_pointer_offset(pointer_offset);

     CUTLASS_PRAGMA_UNROLL
     for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
       CUTLASS_PRAGMA_UNROLL
       for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

         int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
           (mma_n * Policy::MmaIterations::kRow + mma_m);

         CUTLASS_PRAGMA_UNROLL
         for (int row = 0; row < kAccumulatorRows; ++row) {
           CUTLASS_PRAGMA_UNROLL
           for (int col = 0; col < kElementsPerAccess; ++col) {
             int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                           row * kRowsPerTile;
             int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
             int idx = mma_accum_start + row * kElementsPerAccess + col;

             offset_ref.at({accum_m, accum_n}) = frag[idx];
           }
         }
       }
     }
   }

   CUTLASS_DEVICE
   void store_with_byte_offset(
     Fragment const &frag,
     Index byte_offset) const {

     store_with_pointer_offset(byte_offset / sizeof(Element));
   }

   CUTLASS_DEVICE
   void store(
     Fragment &frag,
     TensorCoord const &tile_offset) const {

     store(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void store(
       Fragment const &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
   }
 };


 template <
     typename Shape_,
     typename Element_,
     typename InstructionShape_,
     typename OpDelta_,
     int InterleavedN>
 class MmaTensorOpAccumulatorTileIterator<
     Shape_, Element_, cutlass::layout::ColumnMajorInterleaved<InterleavedN>,
     InstructionShape_, OpDelta_> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand::kC;

   using Element = Element_;

   using Layout = cutlass::layout::ColumnMajorInterleaved<InterleavedN>;

   using InstructionShape = InstructionShape_;

   using OpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   struct Policy {
     static_assert(
         !(Shape::kRow % InstructionShape::kM) &&
             !(Shape::kColumn % InstructionShape::kN),
         "Shape of warp-level Mma must be divisible by operator shape.");

     static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
       "Layouts must be defined for logical MatrixCoord coordinate space.");

     using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
                                       Shape::kColumn / InstructionShape::kN>;
   };

 private:

   static int const kElementsPerAccess = 2;

 public:

   //
   // Derived quantities
   //

   using AccessType = Array<Element, kElementsPerAccess>;

   using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:

   TensorRef ref_;

 public:

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator() { }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator(
     TensorRef const &ref,
     int lane_id
   ):
     ref_(ref) {

     int quad = (lane_id >> 2);
     int lane_in_quad = (lane_id & 3);

     MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);

     ref_.add_coord_offset(lane_offset);
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
     ref_.add_pointer_offset(offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator++() {
     // deliberate no-op
     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator--() {
     // deliberate no-op
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(tile_offset);
     return *this;
   }

   CUTLASS_DEVICE
   MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-tile_offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {
     load_with_pointer_offset(frag, 0);
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
     Fragment &frag,
     Index pointer_offset) const {

     TensorRef offset_ref(ref_);
     offset_ref.add_pointer_offset(pointer_offset);

     AccessType* frag_ptr = reinterpret_cast<AccessType *>(&frag);

     CUTLASS_PRAGMA_UNROLL
     for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
       CUTLASS_PRAGMA_UNROLL
       for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
         int accum_m = mma_m * InstructionShape::kM;
         int accum_n = mma_n * InstructionShape::kN;

         int idx = mma_m + mma_n * Policy::MmaIterations::kRow;

         AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
           offset_ref.offset(TensorCoord(accum_m, accum_n)));

         frag_ptr[idx] = access_ptr[0];
       }
     }
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
     Fragment &frag,
     Index byte_offset) const {

     load_with_pointer_offset(byte_offset / sizeof(Element));
   }

   CUTLASS_DEVICE
   void load(
     Fragment &frag,
     TensorCoord const &tile_offset) const {

     load(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void load(
     Fragment &frag,
     TensorCoord const &tile_offset,
     Index pointer_offset) const {

     load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
   }

   CUTLASS_HOST_DEVICE
   void store(Fragment const &frag) const {
     store_with_pointer_offset(frag, 0);
   }

   CUTLASS_DEVICE
   void store_with_pointer_offset(
     Fragment const &frag,
     Index pointer_offset) const {

     TensorRef offset_ref(ref_);
     offset_ref.add_pointer_offset(pointer_offset);

     AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);

     CUTLASS_PRAGMA_UNROLL
     for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
       CUTLASS_PRAGMA_UNROLL
       for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
         int accum_m = mma_m * InstructionShape::kM;
         int accum_n = mma_n * InstructionShape::kN;

         int idx = mma_m + mma_n * Policy::MmaIterations::kRow;

   AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
          offset_ref.offset(TensorCoord(accum_m, accum_n)));

         access_ptr[0] = frag_ptr[idx];
       }
     }
   }

   CUTLASS_DEVICE
   void store_with_byte_offset(
     Fragment const &frag,
     Index byte_offset) const {

     store_with_pointer_offset(byte_offset / sizeof(Element));
   }

   CUTLASS_DEVICE
   void store(
     Fragment &frag,
     TensorCoord const &tile_offset) const {

     store(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void store(
       Fragment const &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
   }
 };

 } // namespace warp
 } // namespace gemm
 } // namespace cutlass

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::OpDelta
OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator.h:1836

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::Shape
Shape_ Shape
Shape of tile to load (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:2119

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:1389

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_pointer_offset
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:1139

cutlass::MatrixShape
Describes the size of a matrix tile.
Definition: matrix_shape.h:42

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:1652

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:1915

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator.h:476

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:1052

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:2433

cutlass
Definition: aligned_buffer.h:35

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:483

tensor_ref.h
Defines a structure containing strides, bounds, and a pointer to tensor data.

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1514

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1259

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator.h:966

tensor_op_multiplicand_sm75.h

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:223

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator--
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:569

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator+=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1207

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm75.h:734

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_pointer_offset
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:257

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::MmaTensorOpAccumulatorTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:2496

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::store
CUTLASS_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const
Stores a fragment to memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2073

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::store
CUTLASS_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const
Stores a fragment to memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2661

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::operator--
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:2236

cutlass::TensorRef::data
CUTLASS_HOST_DEVICE Element * data() const
Returns the pointer to referenced data.
Definition: tensor_ref.h:254

cutlass::platform::is_same
std::is_same (false specialization)
Definition: platform.h:394

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:765

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator++
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1163

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:773

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:2296

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator.h:1845

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::store
CUTLASS_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Stores a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2381

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator.h:1026

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:219

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:732

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator-=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:328

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator+=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:321

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::operator-=
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1952

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:408

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:1418

cutlass::make_Coord
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate.
Definition: coord.h:387

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator+=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1692

cutlass::gemm::Operand
Operand
GEMM operand enumeration: D = A * B + C.
Definition: include/cutlass/gemm/gemm.h:39

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::store
CUTLASS_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Stores a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2082

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous
Definition: tensor_op_multiplicand_sm75.h:422

memory_sm75.h
Architecture-specific operators on memory added for SM75.

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm75.h:835

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1279

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator.h:1616

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:872

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:1224

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator++
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:293

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:840

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1965

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator.h:1386

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::add_tile_offset
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1922

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::operator+=
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:1945

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::layout::TensorOpMultiplicandCongruous
Definition: tensor_op_multiplicand_sm75.h:213

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:432

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:1648

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator.h:1583

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator.h:2143

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:1833

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator--
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1683

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:2562

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator-=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:816

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::store_with_pointer_offset
CUTLASS_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:2031

platform.h
C++ features that may be otherwise unimplemented for CUDA device functions.

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:2146

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:2263

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator.h:726

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator+=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:809

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::operator+=
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:2243

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::MmaTensorOpAccumulatorTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:1895

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator.h:1884

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:860

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:599

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator-=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1216

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:609

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1757

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::operator++
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:2229

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator--
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the opposite of the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:312

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::add_tile_offset
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2519

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:1546

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:714

cutlass::TensorRef::add_coord_offset
CUTLASS_HOST_DEVICE TensorRef & add_coord_offset(TensorCoord const &coord)
Adds an offset to each pointer.
Definition: tensor_ref.h:326

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:641

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_tile_offset
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:266

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:2556

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1723

cutlass::layout::ColumnMajor
Mapping function for column-major matrices.
Definition: layout/matrix.h:142

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::operator--
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1938

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator.h:2445

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::MmaTensorOpAccumulatorTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:2197

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:830

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::operator++
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:2528

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_tile_offset
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:782

cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

cutlass::gemm::Operand::kC
B multiplicand.

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_tile_offset
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1666

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator.h:202

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1503

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1483

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::add_tile_offset
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2220

tensor.h
Defines layout functions used by TensorRef and derived classes for common 4-D and 5-D tensor formats...

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:1776

cutlass::gemm::Operand::kA

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator.h:466

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::OpDelta
OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator.h:2436

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:1851

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::store_with_byte_offset
CUTLASS_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:2652

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::MmaTensorOpAccumulatorTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:2193

cutlass::TensorRef::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the layout object&#39;s stride vector.
Definition: tensor_ref.h:277

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::store
CUTLASS_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const
Stores a fragment to memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2372

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:1427

cutlass::TensorRef::TensorCoord
typename Layout::TensorCoord TensorCoord
Coordinate in logical tensor space.
Definition: tensor_ref.h:171

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::store
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const
Stores a fragment to memory.
Definition: mma_tensor_op_tile_iterator.h:2025

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:534

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:386

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1527

cutlass::sizeof_bits
Defines the size of an element in bits.
Definition: numeric_types.h:42

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:342

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1291

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1997

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator-=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1471

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator
Definition: mma_tensor_op_tile_iterator.h:1794

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator.h:1409

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator.h:750

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:1622

nullptr
#define nullptr
nullptr
Definition: platform.h:144

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:1657

cutlass::TensorRef< Element, Layout >

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:761

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1228

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::operator-=
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:2549

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::store
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const
Stores a fragment to memory.
Definition: mma_tensor_op_tile_iterator.h:2618

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:823

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator++
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1675

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::store_with_byte_offset
CUTLASS_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:2064

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_tile_offset
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1436

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator.h:107

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator++
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:791

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator.h:138

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:1373

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:662

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::Shape
Shape_ Shape
Shape of tile to load (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:2421

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:969

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::OpDelta
OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator.h:2134

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:1316

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator++
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:560

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:893

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::TensorRef::offset
CUTLASS_HOST_DEVICE LongIndex offset(TensorCoord const &coord) const
Computes the offset of an index from the origin of the tensor.
Definition: tensor_ref.h:301

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator
Definition: mma_tensor_op_tile_iterator.h:75

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:2512

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator.h:930

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:141

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2599

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1493

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:1959

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::operator++
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1931

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2608

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator++
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1445

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator--
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1453

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::store_with_byte_offset
CUTLASS_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:2363

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2305

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator-=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:585

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: GemmShape)
Definition: mma_tensor_op_tile_iterator.h:123

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:542

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:729

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:1848

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:1709

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator.h:519

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:1619

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator.h:495

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:1422

cutlass::TensorRef::Index
typename Layout::Index Index
Index type.
Definition: tensor_ref.h:165

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1744

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2015

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:2131

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1269

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::Shape
Shape_ Shape
Shape of tile to load (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:1821

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: GemmShape)
Definition: mma_tensor_op_tile_iterator.h:950

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:530

cutlass::TensorRef::at
CUTLASS_HOST_DEVICE Reference at(TensorCoord const &coord) const
Returns a reference to the element at a given Coord.
Definition: tensor_ref.h:307

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::store
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const
Stores a fragment to memory.
Definition: mma_tensor_op_tile_iterator.h:2324

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:376

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:2257

matrix.h
Defines layout functions used by TensorRef and derived classes.

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::store
CUTLASS_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Stores a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2670

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator.h:1353

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:396

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator.h:697

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator-=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1701

fast_math.h
Math utilities.

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_tile_offset
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1148

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator+=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:578

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::AccessType
Array< Element, kElementsPerAccess > AccessType
Definition: mma_tensor_op_tile_iterator.h:2478

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:498

pitch_linear.h
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.

cutlass::layout::ColumnMajorInterleaved
Definition: layout/matrix.h:343

cutlass::TensorRef::add_pointer_offset
CUTLASS_HOST_DEVICE TensorRef & add_pointer_offset(LongIndex offset_)
Adds an offset to each pointer.
Definition: tensor_ref.h:319

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator.h:1639

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:972

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1733

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:335

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:1479

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::store_with_pointer_offset
CUTLASS_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:2330

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1713

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:1061

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:2590

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::MmaTensorOpAccumulatorTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:1899

cutlass::layout::TensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm75.h:632

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator.h:116

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:2448

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:629

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:1392

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::operator-=
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:2250

cutlass::gemm::Operand::kB
A multiplicand.

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:850

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator--
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1202

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:619

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:2149

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator.h:2182

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2314

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::operator--
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:2535

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator--
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:800

cutlass.h
Basic include for CUTLASS.

cutlass::MatrixCoord
Definition: matrix_coord.h:39

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::MmaTensorOpAccumulatorTileIterator
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:2492

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:144

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:592

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator+=
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1462

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::operator+=
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:2542

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:501

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_ >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2006

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::store_with_pointer_offset
CUTLASS_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:2624

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:2451

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, Crosswise >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:1603

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajorInterleaved< InterleavedN >, InstructionShape_, OpDelta_ >::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator.h:2481

cutlass::TensorRef::LongIndex
typename Layout::LongIndex LongIndex
Long index used for pointer offsets.
Definition: tensor_ref.h:168

cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_tile_offset
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:551

cutlass::layout::RowMajorTensorOpMultiplicandCongruous
Definition: tensor_op_multiplicand_sm75.h:527

cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:2213