cutlass/mma__tensor__op__tile__iterator__sm70_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"

 #include "cutlass/array.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/tensor_ref.h"
 #include "cutlass/matrix_shape.h"

 #include "cutlass/gemm/gemm.h"

 #include "cutlass/layout/matrix.h"
 #include "cutlass/layout/pitch_linear.h"
 #include "cutlass/layout/tensor_op_multiplicand_sm70.h"

 #include "cutlass/platform/platform.h"


 namespace cutlass {
 namespace gemm {
 namespace warp {


 template <
     typename Shape_,
     Operand Operand,
     typename Element_,
     typename Layout_,
     typename InstructionShape_,
     int OpDelta_,
     int Threads>
 class MmaVoltaTensorOpMultiplicandTileIterator;


 template <
     typename Shape_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_>
 class MmaVoltaTensorOpMultiplicandTileIterator<
     Shape_, Operand::kA, Element_,
     cutlass::layout::VoltaTensorOpMultiplicandCongruous<
         sizeof_bits<Element_>::value>,
     InstructionShape_, OpDelta_, 32> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand::kA;

   using Element = Element_;

   using Layout = cutlass::layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   struct Policy {
     static_assert(
         !(Shape::kContiguous % InstructionShape::kContiguous),
         "Shape of warp-level Mma must be divisible by operator shape.");

     // Shape of one individual LDS.128
     // TODO: 32 and 4 are hardcoded, 32-by-4 is logical shape
     using LdsShape = layout::PitchLinearShape<
       32,
       4
     >;

     // LdsShapes are arranged in the strided direction in SMEM
     using LdsIterations = layout::PitchLinearShape<
       InstructionShape::kStrided / LdsShape::kStrided,
       Shape::kContiguous / LdsShape::kContiguous
     >;
   };

 private:

   static_assert(kOpDelta == 1,
     "Alternative arrangements not supported at present.");

   static int const kPointerCount = 2;

   using AccessType = AlignedArray<Element, Layout::kElementsPerAccess>;

 public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

 private:

   Index stride_;

   AccessType const *pointer_[kPointerCount];

   Index byte_offset_;

 public:

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator(
     TensorRef const &ref,
     int lane_id
   ):
     stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
     // swizzle patterns for operandA LDS are
     // 1. (tid[4] << 3) | (tid[2:0] ^ tid[4])
     // 2. (tid[4] << 3) | (tid[2:0] ^ tid[4] ^ 0b10010)

     int vec_row = (lane_id >> 4); // tid[4]
     int vec_col = ((lane_id & 4) >> 2); // tid[2]

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < kPointerCount; ++i) {

       if(i == 1) {
         vec_row |= 2;
       }
       int access_contiguous_idx = (vec_col << 2) | ((lane_id & 3) ^ vec_row);
       int access_contiguous = access_contiguous_idx;

       int access_strided = vec_row;
       pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
         access_contiguous + access_strided * stride_;
     }

   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

     byte_offset_ += offset * sizeof(Element);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     int contiguous_offset = tile_offset.contiguous();
     int strided_offset = tile_offset.strided();

     // To support 32x32 tile size
     if (Shape::kContiguous == Policy::LdsShape::kContiguous) {
       if (contiguous_offset % 2) {
         AccessType const *tmp_pointer = pointer_[0];
         pointer_[0] = pointer_[1];
         pointer_[1] = tmp_pointer;
       }
       contiguous_offset = contiguous_offset / 2;
     }

     int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
                      Layout::kElementsPerAccess +
                  contiguous_offset * Shape::kContiguous;

     add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
     byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
                     Layout::kElementsPerAccess;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
     byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
                     Layout::kElementsPerAccess;

     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(tile_offset);
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-tile_offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {

     load_with_byte_offset(frag, 0);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {

     AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);

     CUTLASS_PRAGMA_UNROLL
     for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {

       CUTLASS_PRAGMA_UNROLL
       for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {

         int access_idx = c + s * Policy::LdsIterations::kContiguous;

         AccessType const *source_ptr = pointer_[s & 1] +
           Policy::LdsShape::kContiguous * c +
           Policy::LdsShape::kStrided * (s / 2) * stride_;

         char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
         fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
       }
     }
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     load_with_byte_offset(frag, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     load_with_byte_offset(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     Index pointer_offset =
         tile_offset.contiguous() * Shape::kContiguous /
             Layout::kElementsPerAccess +
         tile_offset.strided() * InstructionShape::kStrided * stride_;

     byte_offset += sizeof(AccessType) * pointer_offset;

     load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     // no operation here
   }
 };


 template <
     typename Shape_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_>

 class MmaVoltaTensorOpMultiplicandTileIterator<
     Shape_, Operand::kB, Element_,
     cutlass::layout::VoltaTensorOpMultiplicandBCongruous<
         sizeof_bits<Element_>::value>,
     InstructionShape_, OpDelta_, 32> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand::kB;

   using Element = Element_;

   using Layout = cutlass::layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   struct Policy {
     static_assert(
         !(Shape::kContiguous % InstructionShape::kContiguous),
         "Shape of warp-level Mma must be divisible by operator shape.");

     // Shape of one individual LDS
     // TODO: remove hardcoded 32 and 4
     using LdsShape = layout::PitchLinearShape<
       32,
       4
     >;

     using LdsIterations = layout::PitchLinearShape<
       Shape::kContiguous / LdsShape::kContiguous,
       InstructionShape::kStrided / LdsShape::kStrided
     >;
   };

 private:

   static_assert(kOpDelta == 1,
     "Alternative arrangements not supported at present.");

   using AccessType = AlignedArray<Element, Layout::kElementsPerAccess>;

 public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

 private:

   Index stride_;

   AccessType const *pointer_;

   Index byte_offset_;

 public:

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator(
     TensorRef const &ref,
     int lane_id
   ):
     stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

     // swizzle pattern is (tid & (3 << 3) | (tid[1:0] ^ tid[4:3]))
     int access_strided = (lane_id >> 3) & 0x3;
     int access_contiguous = ((lane_id ^ (lane_id >> 3)) & 0x3);

     pointer_ = reinterpret_cast<AccessType const *>(ref.data()) +
                 access_contiguous + access_strided * stride_;

   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

     byte_offset_ += offset * sizeof(Element);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     int contiguous_offset = tile_offset.contiguous();
     int strided_offset = tile_offset.strided();

     int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
                      Layout::kElementsPerAccess +
                  contiguous_offset * Shape::kContiguous;

     add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
     byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
                     Layout::kElementsPerAccess;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
     byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
                     Layout::kElementsPerAccess;

     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(tile_offset);
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-tile_offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {

     load_with_byte_offset(frag, 0);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {

     AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);

     CUTLASS_PRAGMA_UNROLL
     for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {

       CUTLASS_PRAGMA_UNROLL
       for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {

         int access_idx = c + s * Policy::LdsIterations::kContiguous;

         AccessType const *source_ptr = pointer_ +
           Policy::LdsShape::kContiguous / Layout::kElementsPerAccess * c +
           Policy::LdsShape::kStrided * s * stride_;

         char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
         fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
       }
     }
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     load_with_byte_offset(frag, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     load_with_byte_offset(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     Index pointer_offset =
         tile_offset.contiguous() * Shape::kContiguous /
             Layout::kElementsPerAccess +
         tile_offset.strided() * InstructionShape::kStrided * stride_;

     byte_offset += sizeof(AccessType) * pointer_offset;

     load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     // no operation here
   }
 };


 template <
     typename Shape_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_>
 class MmaVoltaTensorOpMultiplicandTileIterator<
     Shape_, Operand::kA, Element_,
     cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
         sizeof_bits<Element_>::value>,
     InstructionShape_, OpDelta_, 32> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand::kA;

   using Element = Element_;

   using Layout = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   using Base = MmaVoltaTensorOpMultiplicandTileIterator<
       layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
       layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
       layout::PitchLinearShape<InstructionShape::kRow,
                                InstructionShape::kColumn>,
       kOpDelta, kThreads>;

  public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

 private:

   Base iterator_;

 public:

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator() { }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator(
     TensorRef const &ref,
     int lane_id
   ): iterator_({ref.data(), ref.stride()}, lane_id) {
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

     iterator_.add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator++() {

     ++iterator_;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator--() {

     --iterator_;

     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {

     iterator_.load(frag);
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     iterator_.load_with_pointer_offset(frag, pointer_offset);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     // TODO
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     // TODO
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(
       frag,
       {tile_offset.contiguous(), tile_offset.strided()},
       byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     iterator_.set_kgroup_index(k_group);
   }
 };


 template <
     typename Shape_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_>
 class MmaVoltaTensorOpMultiplicandTileIterator<
     Shape_, Operand::kB, Element_,
     cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
         sizeof_bits<Element_>::value>,
     InstructionShape_, OpDelta_, 32> {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand::kB;

   static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
     "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");

   using Element = Element_;

   using Layout = cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   using Base = MmaVoltaTensorOpMultiplicandTileIterator<
       layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
       layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
       layout::PitchLinearShape<InstructionShape::kColumn,
                                InstructionShape::kRow>,
       kOpDelta, kThreads>;

  public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

 private:

   Base iterator_;

 public:

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator() { }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator(
     TensorRef const &ref,
     int lane_id
   ): iterator_({ref.data(), ref.stride()}, lane_id) {
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

     iterator_.add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator++() {

     ++iterator_;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator--() {

     --iterator_;

     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {

     iterator_.load(frag);
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     iterator_.load_with_pointer_offset(frag, pointer_offset);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     // TODO
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     // TODO
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(
       frag,
       {tile_offset.strided(), tile_offset.contiguous()},
       byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     iterator_.set_kgroup_index(k_group);
   }
 };


 template <
     typename Shape_,
     typename Element_,
     typename Layout_,
     typename InstructionShape_,
     typename OpDelta_>
 class MmaVoltaTensorOpAccumulatorTileIterator {
  public:

   using Shape = Shape_;

   static Operand const kOperand = Operand::kC;

   using Element = Element_;

   using Layout = Layout_;

   using InstructionShape = InstructionShape_;

   using OpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   struct Policy {

     using InterleavedTile = MatrixShape<32, 32>;

     static_assert(!(Shape::kRow % InterleavedTile::kRow) && !(Shape::kColumn % InterleavedTile::kColumn),
       "Shape of warp-level Mma must be divisible by operator shape.");

     static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
       "Layouts must be defined for logical MatrixCoord coordinate space.");

     using TileIterations = MatrixShape<
       Shape::kRow / InterleavedTile::kRow,
       Shape::kColumn / InterleavedTile::kColumn
     >;

     using MmaIterations =
         MatrixShape<InterleavedTile::kRow / InstructionShape::kM,
                     InterleavedTile::kColumn / InstructionShape::kN>;
   };

 private:

   // Assume accumulator tile is multipile interleaved 32x32 tile.
   static int const kElementsPerPartial = 4;
   using EleShapePerPatial = typename platform::conditional<
                               platform::is_same<Element, float>::value,
                               MatrixShape<2, 2>,
                               MatrixShape<1, 4> >::type;
   static int const kElementsPerMma = 8;
   static int const kAccumulatorPatials = 2;
   using QuadShapePerPatialMma = MatrixShape<4, 4>;

 public:

   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:

   TensorRef ref_;

 public:

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpAccumulatorTileIterator() { }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpAccumulatorTileIterator(
     TensorRef const &ref,
     int lane_id
   ):
     ref_(ref) {

     int quad = (lane_id >> 2);
     int lane_in_quad = (lane_id & 3);
     int accum_m, accum_n;

     if (platform::is_same<Element, float>::value) {
       // (quad[2],quad[0])+lane_in_quad[0]
       accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
       // (quad[1])+lane_in_quad[1]
       accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
                   (lane_in_quad & 2);
     } else {
       accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + lane_in_quad; // (quad[2],quad[0])
       accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
     }
     MatrixCoord lane_offset(accum_m, accum_n);

     ref_.add_coord_offset(lane_offset);
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
     ref_.add_pointer_offset(offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

     ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpAccumulatorTileIterator & operator++() {
     // deliberate no-op
     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpAccumulatorTileIterator & operator--() {
     // deliberate no-op
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
     add_tile_offset(tile_offset);
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
     add_tile_offset(-tile_offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const {
     load_with_pointer_offset(frag, 0);
   }

   CUTLASS_HOST_DEVICE
   void load_with_pointer_offset(
     Fragment &frag,
     Index pointer_offset) const {

     TensorRef offset_ref(ref_);
     offset_ref.add_pointer_offset(pointer_offset);

     CUTLASS_PRAGMA_UNROLL
     for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
       CUTLASS_PRAGMA_UNROLL
       for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
         CUTLASS_PRAGMA_UNROLL
         for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
           CUTLASS_PRAGMA_UNROLL
           for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

             int mma_accum_start =
                 (((tile_n * Policy::TileIterations::kRow + tile_m) *
                     Policy::MmaIterations::kColumn + mma_n) *
                      Policy::MmaIterations::kRow + mma_m) *
                     kElementsPerMma;

            CUTLASS_PRAGMA_UNROLL
             for (int p = 0; p < kAccumulatorPatials; ++p) {
               CUTLASS_PRAGMA_UNROLL
               for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
                 CUTLASS_PRAGMA_UNROLL
                 for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
                   int accum_m = tile_m * Policy::InterleavedTile::kRow +
                                 mma_m * QuadShapePerPatialMma::kRow + m * 2;
                   int accum_n = tile_n * Policy::InterleavedTile::kColumn +
                                 mma_n * QuadShapePerPatialMma::kColumn +
                                 p * Policy::InterleavedTile::kColumn/2 + n;
                   int idx = mma_accum_start + p * kElementsPerPartial +
                             m * EleShapePerPatial::kColumn + n;
                 frag[idx] = offset_ref.at({accum_m, accum_n});
                 }
               }
             }
           }
         }
       }
     }
   }
   CUTLASS_DEVICE
   void load_with_byte_offset(
     Fragment &frag,
     Index byte_offset) const {

     load_with_pointer_offset(byte_offset / sizeof(Element));
   }

   CUTLASS_HOST_DEVICE
   void load(
     Fragment &frag,
     TensorCoord const &tile_offset) const {

     load(frag, tile_offset, 0);
   }

   CUTLASS_HOST_DEVICE
   void load(
     Fragment &frag,
     TensorCoord const &tile_offset,
     Index pointer_offset) const {

     load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
   }

   CUTLASS_HOST_DEVICE
   void store(Fragment const &frag) const {
     store_with_pointer_offset(frag, 0);
   }

   CUTLASS_HOST_DEVICE
   void store_with_pointer_offset(
     Fragment const &frag,
     Index pointer_offset) const {

     TensorRef offset_ref(ref_);
     offset_ref.add_pointer_offset(pointer_offset);

     CUTLASS_PRAGMA_UNROLL
     for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
       CUTLASS_PRAGMA_UNROLL
       for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
         CUTLASS_PRAGMA_UNROLL
         for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
           CUTLASS_PRAGMA_UNROLL
           for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

             int mma_accum_start =
                 (((tile_n * Policy::TileIterations::kRow + tile_m) *
                     Policy::MmaIterations::kColumn + mma_n) *
                      Policy::MmaIterations::kRow + mma_m) *
                     kElementsPerMma;

             CUTLASS_PRAGMA_UNROLL
             for (int p = 0; p < kAccumulatorPatials; ++p) {
               CUTLASS_PRAGMA_UNROLL
               for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
                 CUTLASS_PRAGMA_UNROLL
                 for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
                   int accum_m = tile_m * Policy::InterleavedTile::kRow +
                                 mma_m * QuadShapePerPatialMma::kRow + m * 2;
                   int accum_n = tile_n * Policy::InterleavedTile::kColumn +
                                 mma_n * QuadShapePerPatialMma::kColumn +
                                 p * Policy::InterleavedTile::kColumn/2 + n;
                   int idx = mma_accum_start + p * kElementsPerPartial +
                             m * EleShapePerPatial::kColumn + n;
                   offset_ref.at({accum_m, accum_n}) = frag[idx];
                 }
               }
             }
           }
         }
       }
     }
   }

   CUTLASS_HOST_DEVICE
   void store_with_byte_offset(
     Fragment const &frag,
     Index byte_offset) const {

     store_with_pointer_offset(byte_offset / sizeof(Element));
   }

   CUTLASS_HOST_DEVICE
   void store(
     Fragment &frag,
     TensorCoord const &tile_offset) const {

     store(frag, tile_offset, 0);
   }

   CUTLASS_HOST_DEVICE
   void store(
       Fragment const &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
   }
 };

 template <
     typename Shape_,
     Operand Operand_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_,
     int KBlock>
 class MmaVoltaTensorOpMultiplicandTileIterator<
     Shape_, Operand_, Element_,
     cutlass::layout::VoltaTensorOpMultiplicandCrosswise<
         sizeof_bits<Element_>::value, KBlock>,
     InstructionShape_, OpDelta_, 32> {
  public:
   using Shape = Shape_;

   static Operand const kOperand = Operand_;

   static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                 "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for "
                 "A or B operands to warp-level Mma.");

   using Element = Element_;

   static int const kKBlock = KBlock;

   using Layout = cutlass::layout::VoltaTensorOpMultiplicandCrosswise<
       sizeof_bits<Element_>::value, kKBlock>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   struct Policy {

     using LdsShape = layout::PitchLinearShape<1, 32>;

     using LdsIterations = layout::PitchLinearShape<1, Shape::kStrided / 32>;

     static int const kElementsPerAccess = 8;

     static int const kContiguousElementsPerLine = 4;
   };

  private:
   static_assert(kOpDelta == 1,
                 "Alternative arrangements not supported at present.");

   using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;

  public:
   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

  private:

   Index stride_;

   AccessType const *pointer_;

   Index byte_offset_;

   Index line_size;

   int k_group_idx_;

  public:
   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator()
       : pointer_(nullptr),
         stride_(0),
         line_size(0),
         byte_offset_(0),
         k_group_idx_(0) {}

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
       : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
         stride_(ref.stride(0) * Policy::kElementsPerAccess),
         line_size((ref.stride(0) * Policy::kContiguousElementsPerLine) /
                   Policy::kElementsPerAccess),
         k_group_idx_(0),
         byte_offset_(0) {

     int quad = (lane_id / 4);
     int lane_in_quad = (lane_id % 4);
     int access_contiguous;

     if(kOperand == Operand::kA) {

       // swizzle id: tid[4]|tid[1:0]|(tid[2]^tid[4])
       access_contiguous = ((quad & 0x4) << 1) + ((lane_in_quad) << 1) +
                             ((quad & 0x1) ^ ((quad & 0x4) >> 2));
     } else {

       // swizzle id: tid[4]|tid[1:0]|tid[3]
       access_contiguous = ((quad & 0x4) << 1) + (lane_in_quad << 1) +
                             ((quad & 0x2) >> 1 ^ ((quad & 0x4) >> 2));
     }

     byte_offset_ = access_contiguous *
                    sizeof(Element) * Policy::kElementsPerAccess;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
     byte_offset_ += offset * sizeof(Element);

     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(
       TensorCoord const &tile_offset) {

     int contiguous_offset = tile_offset.contiguous();
     int strided_offset = tile_offset.strided();
     k_group_idx_ = 0;

     pointer_ += contiguous_offset *
                     (InstructionShape::kContiguous /
                      Policy::kContiguousElementsPerLine) *
                     line_size +
                 strided_offset * Shape::kStrided / 2;
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
     k_group_idx_ = (k_group_idx_ + 1) % 8;

     if (k_group_idx_ == 4 || k_group_idx_ == 0) {
       byte_offset_ ^= 1 * sizeof(Element) * Policy::kElementsPerAccess;
     }

     pointer_ += line_size;
     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator--() { assert(0); }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator+=(
       TensorCoord const &tile_offset) {
     add_tile_offset(tile_offset);
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator-=(
       TensorCoord const &tile_offset) {
     add_tile_offset(-tile_offset);
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {

     AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);

     CUTLASS_PRAGMA_UNROLL
     for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {

       CUTLASS_PRAGMA_UNROLL
       for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {

         int access_idx = c + s * Policy::LdsIterations::kContiguous;

         AccessType const *source_ptr = pointer_ +
           Policy::LdsShape::kContiguous * c * line_size +
           Policy::LdsShape::kStrided * s / 2;

         char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
         fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));

         // swap higher 64bit and lower 64bit
         if (k_group_idx_ &  0x2) {
             uint64_t *low = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2;
             uint64_t *high = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2 + 1;
             uint64_t tmp = *low;
             *low = *high;
             *high = tmp;
         }
       }
     }
   }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     load_with_byte_offset(frag, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     load_with_byte_offset(frag, tile_offset, 0);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     Index pointer_offset = tile_offset.contiguous() *
                                InstructionShape::kContiguous /
                                Policy::kElementsPerAccess +
                            tile_offset.strided() * Shape::kStrided * stride_;

     byte_offset += sizeof(AccessType) * pointer_offset;

     load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     k_group_idx_ = k_group;
   }
 };

 template <
     typename Shape_,
     Operand Operand_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_,
     int KBlock>
 class MmaVoltaTensorOpMultiplicandTileIterator<
     Shape_, Operand_, Element_,
     cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
         sizeof_bits<Element_>::value, KBlock>,
     InstructionShape_, OpDelta_, 32> {
  public:
   using Shape = Shape_;

   static Operand const kOperand = Operand_;

   static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                 "MmaTensorOpMultiplicandIterator may only be instantiated for "
                 "A or B operands to warp-level Mma.");

   using Element = Element_;

   static int const kKBlock = KBlock;


   using Layout = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
       sizeof_bits<Element_>::value, kKBlock>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   using Base = MmaVoltaTensorOpMultiplicandTileIterator<
       layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
       layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                  kKBlock>,
       layout::PitchLinearShape<InstructionShape::kRow,
                                InstructionShape::kColumn>,
       kOpDelta, kThreads>;

  public:
   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

  private:
   Base iterator_;

  public:
   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator() {}

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
       : iterator_({ref.data(), ref.stride()}, lane_id) {}

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
     iterator_.add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(
       TensorCoord const &tile_offset) {
     iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
     ++iterator_;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator--() {
     --iterator_;

     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator+=(
       TensorCoord const &tile_offset) {
     add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator-=(
       TensorCoord const &tile_offset) {
     add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const { iterator_.load(frag); }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     iterator_.load_with_pointer_offset(frag, pointer_offset);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     // TODO
     assert(0);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     // TODO
     assert(0);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(
         frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     iterator_.set_kgroup_index(k_group);
   }
 };


 template <
     typename Shape_,
     Operand Operand_,
     typename Element_,
     typename InstructionShape_,
     int OpDelta_,
     int KBlock>
 class MmaVoltaTensorOpMultiplicandTileIterator<
     Shape_, Operand_, Element_,
     cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
         sizeof_bits<Element_>::value, KBlock>,
     InstructionShape_, OpDelta_, 32> {
  public:
   using Shape = Shape_;

   static Operand const kOperand = Operand_;

   static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                 "MmaTensorOpMultiplicandIterator may only be instantiated for "
                 "A or B operands to warp-level Mma.");

   using Element = Element_;

   static int const kKBlock = KBlock;

   using Layout = cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
       sizeof_bits<Element_>::value, kKBlock>;

   using InstructionShape = InstructionShape_;

   static int const kOpDelta = OpDelta_;

   static int const kThreads = 32;

   using TensorRef = TensorRef<Element, Layout>;

   using Index = typename TensorRef::Index;

   using LongIndex = typename TensorRef::LongIndex;

   using TensorCoord = typename TensorRef::TensorCoord;

   using Base = MmaVoltaTensorOpMultiplicandTileIterator<
       layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
       layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                  kKBlock>,
       layout::PitchLinearShape<InstructionShape::kColumn,
                                InstructionShape::kRow>,
       kOpDelta, kThreads>;

  public:
   //
   // Derived quantities
   //

   using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

  private:
   Base iterator_;

  public:
   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator() {}

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
       : iterator_({ref.data(), ref.stride()}, lane_id) {}

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
     iterator_.add_pointer_offset(offset);

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(
       TensorCoord const &tile_offset) {
     iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
     ++iterator_;

     return *this;
   }

   CUTLASS_HOST_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator--() {
     --iterator_;

     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator+=(
       TensorCoord const &tile_offset) {
     add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
     return *this;
   }

   CUTLASS_DEVICE
   MmaVoltaTensorOpMultiplicandTileIterator &operator-=(
       TensorCoord const &tile_offset) {
     add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
     return *this;
   }

   CUTLASS_HOST_DEVICE
   void load(Fragment &frag) const { iterator_.load(frag); }

   CUTLASS_DEVICE
   void load_with_pointer_offset(
       Fragment &frag,
       Index pointer_offset) const {
     iterator_.load_with_pointer_offset(frag, pointer_offset);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(frag, byte_offset);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset) const {
     // TODO
     assert(0);
   }

   CUTLASS_DEVICE
   void load(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index pointer_offset) const {
     // TODO
     assert(0);
   }

   CUTLASS_DEVICE
   void load_with_byte_offset(
       Fragment &frag,
       TensorCoord const &tile_offset,
       Index byte_offset) const {
     iterator_.load_with_byte_offset(
         frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
   }

   CUTLASS_DEVICE
   void set_kgroup_index(int k_group) {
     iterator_.set_kgroup_index(k_group);
   }
 };

 } // namespace warp
 } // namespace gemm
 } // namespace cutlass

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::load
CUTLASS_HOST_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1369

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:2194

cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous
Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous.
Definition: tensor_op_multiplicand_sm70.h:630

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator-=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:1031

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator++
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:254

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:353

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator_sm70.h:189

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator++
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:539

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator_sm70.h:127

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:1166

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:2077

cutlass::MatrixShape
Describes the size of a matrix tile.
Definition: matrix_shape.h:42

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:821

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:2064

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:1151

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:2044

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator
Definition: mma_tensor_op_tile_iterator_sm70.h:70

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:436

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:626

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator-=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:279

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::store
CUTLASS_HOST_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Stores a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1450

cutlass
Definition: aligned_buffer.h:35

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1045

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Fragment
Array< Element, Shape::kCount/kThreads *2 > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator_sm70.h:965

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:945

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:2170

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator+=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:1925

tensor_ref.h
Defines a structure containing strides, bounds, and a pointer to tensor data.

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator_sm70.h:1782

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator_sm70.h:980

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:606

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::OpDelta
OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator_sm70.h:1154

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Fragment
Array< Element, Shape::kCount/kThreads *2 > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator_sm70.h:1560

cutlass::TensorRef::data
CUTLASS_HOST_DEVICE Element * data() const
Returns the pointer to referenced data.
Definition: tensor_ref.h:254

cutlass::platform::is_same
std::is_same (false specialization)
Definition: platform.h:394

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::MmaVoltaTensorOpAccumulatorTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:1224

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:694

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:121

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::store_with_pointer_offset
CUTLASS_HOST_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1385

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator+=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator_sm70.h:1024

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:185

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator_sm70.h:1108

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:1299

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator_sm70.h:415

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::add_pointer_offset
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator_sm70.h:219

cutlass::make_Coord
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate.
Definition: coord.h:387

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1725

cutlass::gemm::Operand
Operand
GEMM operand enumeration: D = A * B + C.
Definition: include/cutlass/gemm/gemm.h:39

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:1881

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator_sm70.h:1499

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator_sm70.h:924

cutlass::AlignedArray
Aligned array type.
Definition: array.h:511

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::operator--
CUTLASS_HOST_DEVICE MmaVoltaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:1278

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator_sm70.h:439

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::load
CUTLASS_HOST_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1360

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator_sm70.h:884

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator++
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:782

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator_sm70.h:1855

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::add_tile_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1262

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:1584

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:1522

cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm70.h:848

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:2174

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator_sm70.h:1825

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1087

platform.h
C++ features that may be otherwise unimplemented for CUDA device functions.

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::MmaVoltaTensorOpAccumulatorTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator_sm70.h:1228

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1956

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator_sm70.h:1169

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1688

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::operator++
CUTLASS_HOST_DEVICE MmaVoltaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:1271

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::Fragment
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator_sm70.h:1213

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::add_tile_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:523

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::Shape
Shape_ Shape
Shape of tile to load (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:1139

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:286

cutlass::TensorRef::add_coord_offset
CUTLASS_HOST_DEVICE TensorRef & add_coord_offset(TensorCoord const &coord)
Adds an offset to each pointer.
Definition: tensor_ref.h:326

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::add_pointer_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator_sm70.h:1255

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::store
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const
Stores a fragment to memory.
Definition: mma_tensor_op_tile_iterator_sm70.h:1379

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator--
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:791

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator-=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:807

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1966

cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

cutlass::gemm::Operand::kC
B multiplicand.

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Fragment
Array< Element, Shape::kCount/kThreads *2 > Fragment
Fragment object holding a thread&#39;s part of a tile, needs on more time number of registers.
Definition: mma_tensor_op_tile_iterator_sm70.h:476

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator++
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:1006

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:1836

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator+=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:1667

cutlass::gemm::Operand::kA

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Fragment
Array< Element, Shape::kCount/kThreads *2 > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator_sm70.h:741

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator_sm70.h:1145

cutlass::TensorRef::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the layout object&#39;s stride vector.
Definition: tensor_ref.h:277

cutlass::TensorRef::TensorCoord
typename Layout::TensorCoord TensorCoord
Coordinate in logical tensor space.
Definition: tensor_ref.h:171

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Fragment
Array< Element, Shape::kCount/kThreads *2 > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator_sm70.h:168

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::operator-=
CUTLASS_DEVICE MmaVoltaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:1292

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator--
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:263

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:1038

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:1852

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator-=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:1934

cutlass::sizeof_bits
Defines the size of an element in bits.
Definition: numeric_types.h:42

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:831

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:1815

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1055

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::add_tile_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:228

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:706

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator--
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:548

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator-=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:2162

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:616

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:841

nullptr
#define nullptr
nullptr
Definition: platform.h:144

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:1684

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator_sm70.h:756

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator_sm70.h:948

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:331

cutlass::TensorRef< Element, Layout >

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:752

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:293

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: GemmShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:421

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator_sm70.h:1885

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:1489

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:976

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator_sm70.h:1890

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1065

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:409

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator-=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:1676

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator+=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator_sm70.h:800

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator_sm70.h:2118

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::add_tile_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:1899

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::load_with_pointer_offset
CUTLASS_HOST_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1305

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1757

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator_sm70.h:1528

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator_sm70.h:2009

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator_sm70.h:2083

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:718

cutlass::layout::VoltaTensorOpMultiplicandBCongruous
Template based on element size (in bits) - defined in terms of pitch-linear memory.
Definition: tensor_op_multiplicand_sm70.h:397

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator-=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:564

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator--
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:1662

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::add_tile_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:997

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::TensorRef::offset
CUTLASS_HOST_DEVICE LongIndex offset(TensorCoord const &coord) const
Computes the offset of an index from the origin of the tensor.
Definition: tensor_ref.h:301

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator--
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:1015

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:863

cutlass::platform::conditional
std::conditional (true specialization)
Definition: platform.h:325

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator_sm70.h:497

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator_sm70.h:764

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator--
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:2144

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::store
CUTLASS_HOST_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const
Stores a fragment to memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1441

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: GemmShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:1509

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::Policy
Internal structure of iterator - made public to enable introspection.
Definition: mma_tensor_op_tile_iterator_sm70.h:1172

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:321

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:638

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:124

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::add_tile_offset
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:1632

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::add_pointer_offset
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator_sm70.h:514

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:2205

cutlass::layout::VoltaTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm70.h:733

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::add_pointer_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator_sm70.h:988

cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm70.h:943

cutlass::TensorRef::Index
typename Layout::Index Index
Index type.
Definition: tensor_ref.h:165

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:942

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:1163

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1351

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:341

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:851

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:571

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Fragment
Array< Element, Shape::kCount/kThreads *2 > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator_sm70.h:1872

cutlass::TensorRef::at
CUTLASS_HOST_DEVICE Reference at(TensorCoord const &coord) const
Returns a reference to the element at a given Coord.
Definition: tensor_ref.h:307

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:2184

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator
Definition: mma_tensor_op_tile_iterator_sm70.h:1135

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:578

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::add_pointer_offset
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator_sm70.h:1623

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1735

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:915

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1977

matrix.h
Defines layout functions used by TensorRef and derived classes.

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Fragment
Array< Element, Shape::kCount/kThreads *2 > Fragment
Fragment object holding a thread&#39;s part of a tile.
Definition: mma_tensor_op_tile_iterator_sm70.h:2100

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::store_with_byte_offset
CUTLASS_HOST_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1432

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator_sm70.h:2054

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:2109

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1745

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Shape
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:97

pitch_linear.h
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:814

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::add_tile_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:773

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::operator+=
CUTLASS_DEVICE MmaVoltaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator_sm70.h:1285

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:1942

cutlass::TensorRef::add_pointer_offset
CUTLASS_HOST_DEVICE TensorRef & add_pointer_offset(LongIndex offset_)
Adds an offset to each pointer.
Definition: tensor_ref.h:319

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator+=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator_sm70.h:272

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:1849

cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous
Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous.
Definition: tensor_op_multiplicand_sm70.h:191

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1990

tensor_op_multiplicand_sm70.h

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load_with_pointer_offset
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator_sm70.h:1946

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator_sm70.h:663

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator_sm70.h:2113

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator_sm70.h:2237

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator_sm70.h:103

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::operator+=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator_sm70.h:557

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:930

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator++
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:2136

cutlass::gemm::Operand::kB
A multiplicand.

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator++
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:1649

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::load
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:1075

cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator::Layout
Layout_ Layout
Layout of source tile.
Definition: mma_tensor_op_tile_iterator_sm70.h:1148

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator--
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:1916

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator+=
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:2153

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::TensorRef
TensorRef< Element, Layout > TensorRef
TensorRef type for loading element from a tensor.
Definition: mma_tensor_op_tile_iterator_sm70.h:1519

cutlass::layout::VoltaTensorOpMultiplicandCongruous
Template based on element size (in bits) - defined in terms of pitch-linear memory.
Definition: tensor_op_multiplicand_sm70.h:60

cutlass.h
Basic include for CUTLASS.

cutlass::MatrixCoord
Definition: matrix_coord.h:39

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::TensorCoord
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator_sm70.h:724

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::operator++
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator_sm70.h:1908

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:721

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Element
Element_ Element
Element type.
Definition: mma_tensor_op_tile_iterator_sm70.h:700

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::load_with_byte_offset
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator_sm70.h:2218

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::add_tile_offset
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator_sm70.h:2127

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator_sm70.h:493

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::set_kgroup_index
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator_sm70.h:378

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kB, Element_, cutlass::layout::VoltaTensorOpMultiplicandBCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::Index
typename TensorRef::Index Index
Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:433

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand::kA, Element_, cutlass::layout::VoltaTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value >, InstructionShape_, OpDelta_, 32 >::InstructionShape
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: GemmShape)
Definition: mma_tensor_op_tile_iterator_sm70.h:109

cutlass::TensorRef::LongIndex
typename Layout::LongIndex LongIndex
Long index used for pointer offsets.
Definition: tensor_ref.h:168

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::MmaVoltaTensorOpMultiplicandTileIterator
CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator_sm70.h:1593

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:1525

cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::LongIndex
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator_sm70.h:2080