CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
mma_simt_tile_iterator.h
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
30 #pragma once
31 
32 #include "cutlass/cutlass.h"
33 #include "cutlass/array.h"
34 #include "cutlass/tensor_ref.h"
35 #include "cutlass/matrix_shape.h"
36 #include "cutlass/layout/matrix.h"
37 
38 #include "cutlass/gemm/gemm.h"
40 
42 
43 namespace cutlass {
44 namespace gemm {
45 namespace warp {
46 
48 
53 template <
55  typename Shape_,
57  Operand Operand,
59  typename Element_,
61  typename Layout_,
63  typename Policy_,
65  int PartitionsK = 1,
67  int PartitionGroupSize = 1
68 >
70 class MmaSimtTileIterator;
72 
77 template <
79  typename Shape_,
81  typename Element_,
83  typename Policy_,
85  int PartitionsK,
87  int PartitionGroupSize
88 >
89 class MmaSimtTileIterator<Shape_, Operand::kA, Element_, layout::ColumnMajor, Policy_, PartitionsK, PartitionGroupSize> {
90 public:
91 
93  using Shape = Shape_;
94 
96  static Operand const kOperand = Operand::kA;
97 
99  using Element = Element_;
100 
103 
105  using Policy = Policy_;
106 
109 
111  using Index = typename TensorRef::Index;
112 
114  using LongIndex = typename TensorRef::LongIndex;
115 
118 
119  //
120  // Derived quantities
121  //
122 
123  static_assert(!(Shape::kRow % Policy::WarpShape::kRow),
124  "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
125 
126  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
127  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
128  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
129  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
130 
132  using ThreadShape = MatrixShape<
133  Shape::kRow / Policy::WarpShape::kRow,
134  Shape::kColumn
135  >;
136 
137  static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM),
138  "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
139 
141  using Iterations = MatrixShape<
142  ThreadShape::kRow / Policy::LaneMmaShape::kM,
143  ThreadShape::kColumn
144  >;
145 
147  using Fragment = Array<Element, ThreadShape::kCount>;
148 
149 private:
150 
153 
154 public:
155 
159 
163  TensorRef ref,
164  int lane_id
165  ) {
166 
167  // compute offset based on thread ID and lane layout
168  typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
169 
170  MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
171  MatrixCoord(Policy::LaneMmaShape::kM, 0);
172 
173  ref.add_coord_offset(lane_offset);
174 
175  ref_.reset(
176  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(ref.data()),
177  ref.stride(0) / Policy::LaneMmaShape::kM);
178  }
179 
180 
184  ref_.add_pointer_offset(offset);
185  return *this;
186  }
187 
191 
192  ref_.add_coord_offset({
193  coord.row() * Shape::kRow / Policy::LaneMmaShape::kM,
194  coord.column() * Shape::kColumn});
195 
196  return *this;
197  }
198 
202 
203  ref_.add_coord_offset({0, Shape::kColumn});
204 
205  return *this;
206  }
207 
211 
212  ref_.add_coord_offset({0, -Shape::kColumn});
213 
214  return *this;
215  }
216 
219  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
220  Array<Element, Policy::LaneMmaShape::kM> *dst_ptr =
221  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(&frag);
222 
224  for (int k = 0; k < Iterations::kColumn; ++k) {
226  for (int m = 0; m < Iterations::kRow; ++m) {
227  dst_ptr[m + k * Iterations::kRow] =
228  *(ref_.data() + ref_.offset({m * Policy::WarpShape::kRow, k}) + pointer_offset / Policy::LaneMmaShape::kM);
229  }
230  }
231  }
234  void load(Fragment &frag) const {
235  load_with_pointer_offset(frag, 0);
236  }
237 
240  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
241 
242  Array<Element, Policy::LaneMmaShape::kM> const *src_ptr =
243  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> const *>(&frag);
244 
246  for (int k = 0; k < Iterations::kColumn; ++k) {
248  for (int m = 0; m < Iterations::kRow; ++m) {
249  *(ref_.data() + ref_.offset({m * Policy::WarpShape::kRow, k}) + pointer_offset / Policy::LaneMmaShape::kM) =
250  src_ptr[m + k * Iterations::kRow];
251  }
252  }
253  }
254 
257  void store(Fragment const &frag) const {
258  store_with_pointer_offset(frag, 0);
259  }
260 
268  CUTLASS_DEVICE
269  void set_kgroup_index(int k_group) {
270  // no operation here
271  }
272 };
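// Worked shape example for the specialization above (illustrative values, assumed rather than
// taken from this header): with Shape = MatrixShape<64, 8>, Policy::WarpShape::kRow = 4 and
// Policy::LaneMmaShape::kM = 4, ThreadShape is MatrixShape<64 / 4, 8> = <16, 8>, so each lane
// owns 128 elements of A, and Iterations is MatrixShape<16 / 4, 8> = <4, 8>. load() then issues
// 4 x 8 = 32 reads of Array<Element, 4>, and successive row-blocks read by the same lane are
// Policy::WarpShape::kRow blocks apart in memory.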
273 
275 
280 template <
282  typename Shape_,
284  typename Element_,
286  typename Policy_,
288  int PartitionsK,
290  int PartitionGroupSize
291 >
292 class MmaSimtTileIterator<Shape_, Operand::kB, Element_, layout::RowMajor, Policy_, PartitionsK, PartitionGroupSize> {
293 public:
294 
296  using Shape = Shape_;
297 
299  static Operand const kOperand = Operand::kB;
300 
302  using Element = Element_;
303 
306 
308  using Policy = Policy_;
309 
312 
314  using Index = typename TensorRef::Index;
315 
317  using LongIndex = typename TensorRef::LongIndex;
318 
321 
322  //
323  // Derived quantities
324  //
325 
326  static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn),
327  "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension.");
328 
329  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
330  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
331  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
332  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
333 
335  using ThreadShape = MatrixShape<
336  Shape::kRow,
337  Shape::kColumn / Policy::WarpShape::kColumn
338  >;
339 
340  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN),
341  "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
342 
344  using Iterations = MatrixShape<
345  ThreadShape::kRow,
346  ThreadShape::kColumn / Policy::LaneMmaShape::kN
347  >;
348 
350  using Fragment = Array<Element, ThreadShape::kCount>;
351 
352 private:
353 
356 
357 
358 public:
359 
363 
367  TensorRef ref,
368  int lane_id
369  ) {
370 
371  // compute offset based on thread ID and lane layout
372  typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
373 
374  MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
375  MatrixCoord(0, Policy::LaneMmaShape::kN);
376 
377  ref.add_coord_offset(lane_offset);
378 
379  ref_.reset(
380  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(ref.data()),
381  ref.stride(0) / Policy::LaneMmaShape::kN);
382  }
383 
387  ref_.add_pointer_offset(offset);
388  return *this;
389  }
390 
394 
395  ref_.add_coord_offset({
396  coord.row() * Shape::kRow,
397  coord.column() * Shape::kColumn / Policy::LaneMmaShape::kN});
398 
399  return *this;
400  }
401 
405 
406  ref_.add_coord_offset({Shape::kRow, 0});
407 
408  return *this;
409  }
410 
414 
415  ref_.add_coord_offset({-Shape::kRow, 0});
416 
417  return *this;
418  }
419 
422  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
423 
424  Array<Element, Policy::LaneMmaShape::kN> *dst_ptr =
425  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
426 
428  for (int k = 0; k < Iterations::kRow; ++k) {
430  for (int n = 0; n < Iterations::kColumn; ++n) {
431  dst_ptr[n + k * Iterations::kColumn] =
432  *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kColumn}) + pointer_offset / Policy::LaneMmaShape::kN);
433  }
434  }
435  }
436 
439  void load(Fragment &frag) const {
440  load_with_pointer_offset(frag, 0);
441  }
442 
445  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
446 
447  Array<Element, Policy::LaneMmaShape::kN> const *src_ptr =
448  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> const *>(&frag);
449 
451  for (int k = 0; k < Iterations::kRow; ++k) {
453  for (int n = 0; n < Iterations::kColumn; ++n) {
454  *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kColumn}) + pointer_offset / Policy::LaneMmaShape::kN) =
455  src_ptr[n + k * Iterations::kColumn];
456  }
457  }
458  }
459 
462  void store(Fragment const &frag, Index pointer_offset) const {
463  store_with_pointer_offset(frag, pointer_offset);
464  }
465 
473  CUTLASS_DEVICE
474  void set_kgroup_index(int k_group) {
475  // no operation here
476  }
477 };
478 
480 
485 template <
487  typename Shape_,
489  typename Element_,
491  typename Policy_
492 >
493 class MmaSimtTileIterator<Shape_, Operand::kC, Element_, layout::ColumnMajor, Policy_> {
494 public:
495 
497  using Shape = Shape_;
498 
500  static Operand const kOperand = Operand::kC;
501 
503  using Element = Element_;
504 
507 
509  using Policy = Policy_;
510 
513 
515  using Index = typename TensorRef::Index;
516 
518  using LongIndex = typename TensorRef::LongIndex;
519 
522 
523  //
524  // Derived quantities
525  //
526 
528  static_assert((!(Shape::kRow % Policy::WarpShape::kRow)) && (!(Shape::kColumn % Policy::WarpShape::kColumn)),
529  "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
530 
531  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
532  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
533  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
534  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
535  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
536  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
537 
539  using ThreadShape = MatrixShape<
540  Shape::kRow / Policy::WarpShape::kRow,
541  Shape::kColumn / Policy::WarpShape::kColumn
542  >;
543 
545  static_assert((!(ThreadShape::kRow % Policy::LaneMmaShape::kM)) && (!(ThreadShape::kColumn % Policy::LaneMmaShape::kN)),
546  "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
547 
549  using Iterations = MatrixShape<
550  ThreadShape::kRow / Policy::LaneMmaShape::kM,
551  ThreadShape::kColumn / Policy::LaneMmaShape::kN
552  >;
553 
554  using Delta = MatrixShape<
555  Policy::WarpShape::kRow * Policy::LaneMmaShape::kM,
556  Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN
557  >;
558 
560  using Fragment = Array<Element, ThreadShape::kCount>;
561 
562 private:
563 
564  TensorRef ref_;
565 
566 public:
567 
571 
575  TensorRef const &ref,
576  int lane_id
577  ):
578  ref_(ref) {
579 
580  // compute offset based on thread ID and lane layout
581  typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
582 
583  MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
584  MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN);
585 
586  ref_.add_coord_offset(lane_offset);
587  }
588 
592  ref_.add_pointer_offset(offset);
593  return *this;
594  }
595 
599 
600  ref_.add_coord_offset({
601  coord.row() * Shape::kRow,
602  coord.column() * Shape::kColumn});
603 
604  return *this;
605  }
606 
610 
611  ref_.add_coord_offset({Shape::kRow, 0});
612 
613  return *this;
614  }
615 
619 
620  ref_.add_coord_offset({-Shape::kRow, 0});
621 
622  return *this;
623  }
624 
628  Fragment &frag,
629  Index pointer_offset) const {
630 
632  for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
634  for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
635 
636  Array<Element, Policy::LaneMmaShape::kM> const *src_ptr =
637  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> const *>(
638  ref_.data() + pointer_offset + ref_.offset({0, mma_n * Delta::kColumn + n}));
639 
641  for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
642 
643  Array<Element, Policy::LaneMmaShape::kM> *dst_ptr =
644  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(&frag) +
645  mma_m + Iterations::kRow * (n + mma_n * Policy::LaneMmaShape::kN);
646 
647  *dst_ptr = src_ptr[mma_m * Policy::WarpShape::kRow];
648  }
649  }
650  }
651  }
652 
655  void load(Fragment &frag) const {
656  load_with_pointer_offset(frag, 0);
657  }
658 
661  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
662 
664  for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
666  for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
667 
668  Array<Element, Policy::LaneMmaShape::kM> *dst_ptr =
669  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(
670  ref_.data() + pointer_offset + ref_.offset({0, mma_n * Delta::kColumn + n}));
671 
673  for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
674 
675  Array<Element, Policy::LaneMmaShape::kM> const *src_ptr =
676  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> const *>(&frag) +
677  mma_m + Iterations::kRow * (n + mma_n * Policy::LaneMmaShape::kN);
678 
679  dst_ptr[mma_m * Policy::WarpShape::kRow] = *src_ptr;
680  }
681  }
682  }
683  }
686  void store(Fragment const &frag) const {
687  store_with_pointer_offset(frag, 0);
688  }
689 };
690 
692 
697 template <
699  typename Shape_,
701  typename Element_,
703  typename Policy_
704 >
705 class MmaSimtTileIterator<Shape_, Operand::kC, Element_, layout::RowMajor, Policy_> {
706 public:
707 
709  using Shape = Shape_;
710 
712  static Operand const kOperand = Operand::kC;
713 
715  using Element = Element_;
716 
719 
721  using Policy = Policy_;
722 
725 
727  using Index = typename TensorRef::Index;
728 
730  using LongIndex = typename TensorRef::LongIndex;
731 
734 
735  //
736  // Derived quantities
737  //
738 
740  static_assert((!(Shape::kRow % Policy::WarpShape::kRow)) && (!(Shape::kColumn % Policy::WarpShape::kColumn)),
741  "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
742 
743  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
744  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
745  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
746  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
747  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
748  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
749 
751  using ThreadShape = MatrixShape<
752  Shape::kRow / Policy::WarpShape::kRow,
753  Shape::kColumn / Policy::WarpShape::kColumn
754  >;
755 
757  static_assert((!(ThreadShape::kRow % Policy::LaneMmaShape::kM)) && (!(ThreadShape::kColumn % Policy::LaneMmaShape::kN)),
758  "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
759 
761  using Iterations = MatrixShape<
762  ThreadShape::kRow / Policy::LaneMmaShape::kM,
763  ThreadShape::kColumn / Policy::LaneMmaShape::kN
764  >;
765 
766  using Delta = MatrixShape<
767  Policy::WarpShape::kRow * Policy::LaneMmaShape::kM,
768  Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN
769  >;
770 
772  using Fragment = Array<Element, ThreadShape::kCount>;
773 
774 private:
775 
776  TensorRef ref_;
777 
778 public:
779 
783 
787  TensorRef const &ref,
788  int lane_id
789  ):
790  ref_(ref) {
791 
792  // compute offset based on thread ID and lane layout
793  typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
794 
795  MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
796  MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN);
797 
798  ref_.add_coord_offset(lane_offset);
799  }
800 
804  ref_.add_pointer_offset(offset);
805  return *this;
806  }
807 
811 
812  ref_.add_coord_offset({
813  coord.row() * Shape::kRow,
814  coord.column() * Shape::kColumn});
815 
816  return *this;
817  }
818 
822 
823  ref_.add_coord_offset({Shape::kRow, 0});
824 
825  return *this;
826  }
827 
831 
832  ref_.add_coord_offset({-Shape::kRow, 0});
833 
834  return *this;
835  }
836 
840  Fragment &frag,
841  Index pointer_offset) const {
842 
844  for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
846  for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
847 
848  Array<Element, Policy::LaneMmaShape::kN> const *src_ptr =
849  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> const *>(
850  ref_.data() + pointer_offset + ref_.offset({mma_m * Delta::kRow + m, 0}));
851 
853  for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
854 
855  Array<Element, Policy::LaneMmaShape::kN> *dst_ptr =
856  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag) +
857  mma_n + Iterations::kColumn * (m + mma_m * Policy::LaneMmaShape::kM);
858 
859  *dst_ptr = src_ptr[mma_n * Policy::WarpShape::kColumn];
860  }
861  }
862  }
863  }
864 
867  void load(Fragment &frag) const {
868  load_with_pointer_offset(frag, 0);
869  }
870 
873  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
874 
876  for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
878  for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
879 
880  Array<Element, Policy::LaneMmaShape::kN> *dst_ptr =
881  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(
882  ref_.data() + pointer_offset + ref_.offset({mma_m * Delta::kRow + m, 0}));
883 
885  for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
886 
887  Array<Element, Policy::LaneMmaShape::kN> const *src_ptr =
888  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> const *>(&frag) +
889  mma_n + Iterations::kColumn * (m + mma_m * Policy::LaneMmaShape::kM);
890 
891  dst_ptr[mma_n * Policy::WarpShape::kColumn] = *src_ptr;
892  }
893  }
894  }
895  }
896 
899  void store(Fragment const &frag) const {
900  store_with_pointer_offset(frag, 0);
901  }
902 };
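// Illustrative sketch (not part of the CUTLASS API): the Operand::kC iterators above move a
// warp's accumulator tile between memory and registers. The helper below is an assumption made
// only for this example; IteratorC stands for one of the Operand::kC specializations above, and
// the update step is a placeholder. It shows the construct / load / modify / store pattern.
template <typename IteratorC>
CUTLASS_DEVICE
void example_update_accumulators(typename IteratorC::TensorRef ref_C, int lane_id) {
  IteratorC iter_C(ref_C, lane_id);            // lane offset covers LaneMmaShape::kM x kN blocks
  typename IteratorC::Fragment accum;

  iter_C.load(accum);                          // read this lane's portion of the C tile
  // ... accumulate warp-level MMA results into 'accum' ...
  iter_C.store(accum);                         // write the updated fragment back
}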
903 
905 
907 
912 template <
914  typename Shape_,
916  typename Element_,
918  typename Policy_,
920  int PartitionsK,
922  int PartitionGroupSize
923 >
924 class MmaSimtTileIterator<Shape_, Operand::kA, Element_, layout::ColumnMajorInterleaved<4>, Policy_, PartitionsK, PartitionGroupSize> {
925 public:
926 
928  using Shape = Shape_;
929 
931  static Operand const kOperand = Operand::kA;
932 
934  using Element = Element_;
935 
938 
940  using Policy = Policy_;
941 
944 
946  using Index = typename TensorRef::Index;
947 
949  using LongIndex = typename TensorRef::LongIndex;
950 
953 
955  static const int kInterleave = 4;
956 
958  static const int kPartitionsK = PartitionsK;
959 
961  static const int kGroupPerTile = PartitionGroupSize / Shape::kColumn;
962 
963  //
964  // Derived quantities
965  //
966 
967  static_assert(!(Shape::kRow % Policy::WarpShape::kRow),
968  "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
969 
970  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
971  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
972  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
973  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
974 
976  using ThreadShape = MatrixShape<
977  Shape::kRow / Policy::WarpShape::kRow,
978  Shape::kColumn
979  >;
980 
981  static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM) && !(ThreadShape::kColumn % Policy::LaneMmaShape::kK),
982  "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
983 
985  using Iterations = MatrixShape<
986  ThreadShape::kRow / Policy::LaneMmaShape::kM,
987  ThreadShape::kColumn / Policy::LaneMmaShape::kK
988  >;
989 
991  using Fragment = Array<Element, ThreadShape::kCount>;
992 
993 private:
994 
997 
999  int k_group_idx_;
1000 
1001 public:
1004 
1008  TensorRef ref,
1009  int lane_id
1010  ) {
1011 
1012  // compute offset based on thread ID and lane layout
1013  typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
1014 
1015  MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
1016  MatrixCoord(Policy::LaneMmaShape::kM, 0);
1017 
1018  ref.add_coord_offset(lane_offset);
1019 
1020  k_group_idx_ = 0;
1021  ref_.reset(reinterpret_cast<Array<Element, Policy::LaneMmaShape::kMK> *>(ref.data()), ref.stride(0)/Policy::LaneMmaShape::kMK);
1022  }
1023 
1024 
1028  ref_.add_pointer_offset(offset);
1029  return *this;
1030  }
1031 
1035 
1036  ref_.add_coord_offset({
1037  coord.row() * Shape::kRow / Policy::LaneMmaShape::kMK,
1038  coord.column() * Shape::kColumn});
1039 
1040  return *this;
1041  }
1042 
1046 
1047  add_tile_offset({0, 1});
1048 
1049  if (kPartitionsK > 1) {
1050  ++k_group_idx_;
1051  // Jump to next stage
1052  if (k_group_idx_ == kGroupPerTile) {
1053  k_group_idx_ = 0;
1054  add_tile_offset({0, kGroupPerTile * (kPartitionsK-1)});
1055  }
1056  }
1057 
1058  return *this;
1059  }
1060 
1064 
1065  ref_.add_coord_offset({0, -Shape::kColumn});
1066 
1067  return *this;
1068  }
1069 
1072  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
1073 
1074  Array<Element, Policy::LaneMmaShape::kMK > *dst_ptr =
1075  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kMK> *>(&frag);
1076 
1078  for (int k = 0; k < Iterations::kColumn; ++k) {
1079 
1081  for (int m = 0; m < Iterations::kRow; ++m) {
1082 
1083  dst_ptr[m + k * Iterations::kRow] =
1084  *((ref_.data() + ref_.offset({m * Policy::WarpShape::kRow / kInterleave,
1085  k*Policy::LaneMmaShape::kK}) + pointer_offset / Policy::LaneMmaShape::kM));
1086  }
1087  }
1088  }
1089 
1092  void load(Fragment &frag) const {
1093  load_with_pointer_offset(frag, 0);
1094  }
1095 
1098  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
1099 
1100  Array<Element, Policy::LaneMmaShape::kMK> const *src_ptr =
1101  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kMK> const *>(&frag);
1102 
1104  for (int k = 0; k < Iterations::kColumn; ++k) {
1106  for (int m = 0; m < Iterations::kRow; ++m) {
1107  *(ref_.data() + ref_.offset({m * Policy::WarpShape::kRow / kInterleave, k * Policy::LaneMmaShape::kK}) + pointer_offset / Policy::LaneMmaShape::kM) =
1108  src_ptr[m + k * Iterations::kRow];
1109  }
1110  }
1111  }
1112 
1115  void store(Fragment const &frag) const {
1116  store_with_pointer_offset(frag, 0);
1117  }
1118 
1126  CUTLASS_DEVICE
1127  void set_kgroup_index(int k_group) {
1128  // no operation here
1129  }
1130 };
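// Worked example of the sliced-K advance in operator++ above (values assumed for illustration):
// with Shape::kColumn == 4, PartitionGroupSize == 16 and kPartitionsK == 2, each partition owns
// kGroupPerTile = 16 / 4 = 4 consecutive column tiles. operator++ walks tiles 0..3 of the current
// group; when k_group_idx_ wraps to kGroupPerTile, the iterator calls
// add_tile_offset({0, 4 * (2 - 1)}) to skip the four tiles owned by the other partition, landing
// on this partition's next group.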
1131 
1133 
1138 template <
1140  typename Shape_,
1142  typename Element_,
1144  typename Policy_,
1146  int PartitionsK,
1148  int PartitionGroupSize
1149 >
1150 class MmaSimtTileIterator<Shape_, Operand::kB, Element_, layout::RowMajorInterleaved<4>, Policy_, PartitionsK, PartitionGroupSize> {
1151 public:
1152 
1154  using Shape = Shape_;
1155 
1157  static Operand const kOperand = Operand::kB;
1158 
1160  using Element = Element_;
1161 
1164 
1166  using Policy = Policy_;
1167 
1170 
1172  using Index = typename TensorRef::Index;
1173 
1176 
1179 
1181  static const int kInterleave = 4;
1182 
1184  static const int kPartitionsK = PartitionsK;
1185 
1187  static const int kGroupPerTile = PartitionGroupSize / Shape::kRow;
1188 
1189  //
1190  // Derived quantities
1191  //
1192 
1193  static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn),
1194  "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension.");
1195 
1196  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
1197  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
1198  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
1199  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
1200 
1202  using ThreadShape = MatrixShape<
1203  Shape::kRow,
1204  Shape::kColumn / Policy::WarpShape::kColumn
1205  >;
1206 
1207  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN) && !(ThreadShape::kRow % Policy::LaneMmaShape::kK),
1208  "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
1209 
1211  using Iterations = MatrixShape<
1212  ThreadShape::kRow / Policy::LaneMmaShape::kK,
1213  ThreadShape::kColumn / Policy::LaneMmaShape::kN
1214  >;
1215 
1217  using Fragment = Array<Element, ThreadShape::kCount>;
1218 
1219 
1220 private:
1221 
1224 
1226  int k_group_idx_;
1227 
1228 public:
1229 
1233 
1237  TensorRef ref,
1238  int lane_id
1239  ) {
1240 
1241  // compute offset based on thread ID and lane layout
1242  typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
1243 
1244  MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
1245  MatrixCoord(0, Policy::LaneMmaShape::kN);
1246 
1247  ref.add_coord_offset(lane_offset);
1248 
1249  k_group_idx_ = 0;
1250 
1251  ref_.reset(
1252  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kKN> *>(ref.data()),
1253  ref.stride(0) / Policy::LaneMmaShape::kKN);
1254  }
1255 
1259  ref_.add_pointer_offset(offset);
1260  return *this;
1261  }
1262 
1266 
1267  ref_.add_coord_offset({
1268  coord.row() * Shape::kRow,
1269  coord.column() * Shape::kColumn / Policy::LaneMmaShape::kKN});
1270 
1271  return *this;
1272  }
1273 
1277 
1278  add_tile_offset({1, 0});
1279 
1280  if (kPartitionsK > 1) {
1281  ++k_group_idx_;
1282  // Jump to next stage
1283  if (k_group_idx_ == kGroupPerTile) {
1284  k_group_idx_ = 0;
1285  add_tile_offset({kGroupPerTile * (kPartitionsK-1), 0});
1286  }
1287  }
1288 
1289  return *this;
1290  }
1291 
1295 
1296  ref_.add_coord_offset({-Shape::kRow, 0});
1297 
1298  return *this;
1299  }
1300 
1303  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
1304 
1305  Array<Element, Policy::LaneMmaShape::kKN> *dst_ptr =
1306  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kKN> *>(&frag);
1307 
1309  for (int k = 0; k < Iterations::kRow; ++k) {
1311  for (int n = 0; n < Iterations::kColumn; ++n) {
1312  dst_ptr[n + k * Iterations::kColumn] =
1313  *(ref_.data() + ref_.offset({k * Policy::LaneMmaShape::kK,
1314  n * Policy::WarpShape::kColumn / kInterleave}) + pointer_offset / Policy::LaneMmaShape::kN);
1315  }
1316  }
1317  }
1318 
1321  void load(Fragment &frag) const {
1322  load_with_pointer_offset(frag, 0);
1323  }
1324 
1327  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
1328 
1329  Array<Element, Policy::LaneMmaShape::kKN> const *src_ptr =
1330  reinterpret_cast<Array<Element, Policy::LaneMmaShape::kKN> const *>(&frag);
1331 
1333  for (int k = 0; k < Iterations::kRow; ++k) {
1335  for (int n = 0; n < Iterations::kColumn; ++n) {
1336  *(ref_.data() + ref_.offset({k * Policy::LaneMmaShape::kK, n * Policy::WarpShape::kColumn / kInterleave}) + pointer_offset / Policy::LaneMmaShape::kN) =
1337  src_ptr[n + k * Iterations::kColumn];
1338  }
1339  }
1340  }
1341 
1344  void store(Fragment const &frag, Index pointer_offset) const {
1345  store_with_pointer_offset(frag, pointer_offset);
1346  }
1347 
1355  CUTLASS_DEVICE
1356  void set_kgroup_index(int k_group) {
1357  // no operation here
1358  }
1359 };
1360 
1362 
1363 } // namespace warp
1364 } // namespace gemm
1365 } // namespace cutlass
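The sketch below is not part of this header; it illustrates how the Operand::kA and Operand::kB iterators above are typically composed in a warp-level SIMT mainloop. The iterator types, the thread-level MMA functor, and the buffer references are assumptions chosen only for this example.

#include "cutlass/cutlass.h"
#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"

// Hypothetical warp-level mainloop. IteratorA / IteratorB are assumed to be specializations of
// cutlass::gemm::warp::MmaSimtTileIterator for Operand::kA and Operand::kB sharing one policy;
// ThreadMma is an assumed thread-level multiply-accumulate functor.
template <typename IteratorA, typename IteratorB, typename ThreadMma>
CUTLASS_DEVICE
void warp_mainloop(
    typename IteratorA::TensorRef ref_A,     // warp tile of A (e.g. staged in shared memory)
    typename IteratorB::TensorRef ref_B,     // warp tile of B
    typename ThreadMma::FragmentC &accum,    // per-thread accumulators
    int lane_id,
    int gemm_k_iterations) {

  IteratorA iter_A(ref_A, lane_id);          // each lane starts at its own offset
  IteratorB iter_B(ref_B, lane_id);

  typename IteratorA::Fragment frag_A;
  typename IteratorB::Fragment frag_B;

  ThreadMma mma;

  CUTLASS_PRAGMA_UNROLL
  for (int k = 0; k < gemm_k_iterations; ++k) {
    iter_A.load(frag_A);                     // gather this lane's ThreadShape elements
    iter_B.load(frag_B);

    ++iter_A;                                // advance one warp tile along the K dimension
    ++iter_B;

    mma(accum, frag_A, frag_B, accum);       // thread-level multiply-accumulate (assumed interface)
  }
}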