CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
default_mma_core_simt.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
32 #pragma once
33 
34 #include "cutlass/cutlass.h"
35 #include "cutlass/array.h"
36 #include "cutlass/fast_math.h"
37 
38 #include "cutlass/numeric_types.h"
39 #include "cutlass/matrix_shape.h"
40 
41 
45 
49 
51 
52 namespace cutlass {
53 namespace gemm {
54 namespace threadblock {
55 
56 namespace detail {
57 
// convert a WarpShape which is the whole tile of elements into warp num threads.
// The goal is for each thread's tile of elements to be as square as possible
// for performance (4x4 will be faster than 2x8).
template<typename WarpShape>
constexpr int simt_get_warp_threads_m() {
  // Taller-than-wide warp tiles get 8 threads along M (8x4 lane grid);
  // otherwise 4 (4x8 lane grid), keeping each thread's tile near-square.
  return (WarpShape::kM > WarpShape::kN) ? 8 : 4;
}
65 
/// Computes padding in shared memory to perform efficient transpose without bank conflicts.
///
/// @param threads      number of threads cooperating on the transpose (warp size here)
/// @param crosswise    crosswise extent of the tile (the K dimension at the call sites)
/// @param size_in_bits width of one element in bits
/// @return number of padding elements to append per row/column of the SMEM tile
constexpr int simt_transpose_padding(int threads, int crosswise, int size_in_bits) {
  // For elements of at least one 32-bit bank, fewer padding elements are
  // needed as elements widen; for sub-32-bit elements, more elements are
  // packed per bank, so the padding count scales up by (32 / size_in_bits).
  return (size_in_bits >= 32
              ? threads / crosswise / (size_in_bits / 32)
              : threads / crosswise * (32 / size_in_bits));
}
73 
74 }
75 
77 
/// Partial specialization of DefaultMmaCore: SIMT op class, instruction shape
/// GemmShape<1,1,1>, operand A column-major, operand B row-major, 2 stages.
/// Defines shared-memory layouts/write iterators and the warp-level MMA policy.
/// No transpose-padding constants appear in this specialization, and the
/// operand skews in MmaPolicy are MatrixShape<0, 0> (per the cross-reference,
/// both A and B skews are zero here).
/// NOTE(review): this listing was recovered from generated documentation;
/// several declarations (SmemLayoutA/B, the iterator thread maps, and the
/// SmemIterator/LaneMmaShape/Policy/MmaWarpSimt openings) lost their lines.
/// Restore them from the original header before compiling.
85 template <
88  typename Shape_,
90  typename WarpShape_,
92  typename ElementA_,
94  typename ElementB_,
96  typename ElementC_,
98  typename LayoutC_,
100  typename Operator_>
101 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
102  layout::ColumnMajor, ElementB_, layout::RowMajor,
103  ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
104  > {
105  using Shape = Shape_;
106  using WarpShape = WarpShape_;
108  using ElementA = ElementA_;
110  using ElementB = ElementB_;
112  using ElementC = ElementC_;
113  using LayoutC = LayoutC_;
114  using OperatorClass = arch::OpClassSimt;
115  static int const PartitionsK = Shape::kK / WarpShape::kK;
116 
118  using Operator = Operator_;
119 
121  using WarpCount = GemmShape<
122  Shape::kM / WarpShape::kM,
123  Shape::kN / WarpShape::kN,
124  PartitionsK
125  >;
126 
127  // Divisibility requirements
129  !(Shape::kM % WarpShape::kM) &&
130  !(Shape::kN % WarpShape::kN),
131  "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
132  );
133 
135  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
136 
138  static int const kThreads = WarpCount::kCount * kWarpSize;
139 
140  static int const kElementsPerAccess = 1;
141 
142  //
143  // Shared memory layouts
144  //
145 
148 
149  //
150  // Iterators to write to shared memory
151  //
152 
156  kThreads,
157  kElementsPerAccess
158  >;
159 
163  ElementA,
164  SmemLayoutA,
165  1,
167  >;
168 
172  kThreads,
173  kElementsPerAccess
174  >;
175 
179  ElementB,
180  SmemLayoutB,
181  0,
183  >;
184 
185  //
186  // Warp-level matrix multiply operator
187  //
188 
189  // Define the warp-level op
190  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
191  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
192  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
193  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
194  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
195  "WarpShape must be divisible by ThreadTile shape.");
196  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
197  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
198  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
199  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
200  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
201  // these should have max of thread tile also
203  LaneM,
204  LaneN,
205  1>;
210  >;
211 
213  WarpShape,
214  ElementA,
215  SmemLayoutA,
216  ElementB,
217  SmemLayoutB,
218  ElementC,
219  LayoutC,
220  Policy
221  >;
222 
  // NOTE(review): the A-operand skew line (MatrixShape<0, 0>) was dropped by
  // extraction; the cross-reference shows both skews are zero in this policy.
224  using MmaPolicy = MmaPolicy<
225  MmaWarpSimt,
227  MatrixShape<0, 0>,
228  WarpCount::kK
229  >;
230 };
231 
233 
241 template <
244  typename Shape_,
246  typename WarpShape_,
248  typename ElementA_,
250  typename ElementB_,
252  typename ElementC_,
254  typename LayoutC_,
256  typename Operator_>
257 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
258  layout::RowMajor, ElementB_, layout::ColumnMajor,
259  ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
260  > {
261  using Shape = Shape_;
262  using WarpShape = WarpShape_;
264  using ElementA = ElementA_;
266  using ElementB = ElementB_;
268  using ElementC = ElementC_;
269  using LayoutC = LayoutC_;
270  using OperatorClass = arch::OpClassSimt;
271  static int const PartitionsK = Shape::kK / WarpShape::kK;
272 
274  using Operator = Operator_;
275 
277  using WarpCount = GemmShape<
278  Shape::kM / WarpShape::kM,
279  Shape::kN / WarpShape::kN,
280  PartitionsK
281  >;
282 
283  // Divisility requirements
285  !(Shape::kM % WarpShape::kM) &&
286  !(Shape::kN % WarpShape::kN),
287  "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
288  );
289 
291  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
292 
294  static int const kThreads = WarpCount::kCount * kWarpSize;
295 
296  static int const kElementsPerAccess = 1;
297 
298  //
299  // Shared memory layouts
300  //
301 
302  using SmemLayoutA = layout::ColumnMajor;
303  using SmemLayoutB = layout::RowMajor;
304 
305  //
306  // Iterators to write to shared memory
307  //
308 
312  kThreads,
313  kElementsPerAccess
314  >;
315 
318 
321  MatrixShape<Shape::kM, Shape::kK>,
322  ElementA,
323  SmemLayoutA,
324  1,
325  SmemThreadMapA // was IteratorThreadMapA
326  >;
327 
331  kThreads,
332  kElementsPerAccess
333  >;
334 
337 
340  MatrixShape<Shape::kK, Shape::kN>,
341  ElementB,
342  SmemLayoutB,
343  0,
344  SmemThreadMapB // was IteratorThreadMapA
345  >;
346 
347  //
348  // Warp-level matrix multiply operator
349  //
350 
351  // Define the warp-level op
352  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
353  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
354  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
355  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
356  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
357  "WarpShape must be divisible by ThreadTile shape.");
358  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
359  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
360  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
361  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
362  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
363 
364  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
365  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
366 
367  // these should have max of thread tile also
369  LaneM,
370  LaneN,
371  1>;
376  >;
377 
379  WarpShape,
380  ElementA,
381  SmemLayoutA,
382  ElementB,
383  SmemLayoutB,
384  ElementC,
385  LayoutC,
386  Policy
387  >;
388 
390  using MmaPolicy = MmaPolicy<
391  MmaWarpSimt,
392  MatrixShape<kPaddingN, 0>, // skew for A matrix to avoid SMEM bank conflicts
393  MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts
394  WarpCount::kK
395  >;
396 };
397 
399 
/// Partial specialization of DefaultMmaCore: SIMT op class, instruction shape
/// GemmShape<1,1,1>, operand A row-major, operand B row-major, 2 stages.
/// A is transposed into column-major shared memory, so only kPaddingM (based
/// on ElementA's width) is computed and applied as the A-operand skew; B is
/// stored row-major directly and gets no skew.
/// NOTE(review): recovered from generated documentation; thread-map, iterator
/// and policy opening lines were lost in extraction -- restore from the
/// original header before compiling.
407 template <
410  typename Shape_,
412  typename WarpShape_,
414  typename ElementA_,
416  typename ElementB_,
418  typename ElementC_,
420  typename LayoutC_,
422  typename Operator_>
423 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
424  layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
425  LayoutC_, arch::OpClassSimt, 2, Operator_
426  > {
427  using Shape = Shape_;
428  using WarpShape = WarpShape_;
430  using ElementA = ElementA_;
432  using ElementB = ElementB_;
434  using ElementC = ElementC_;
435  using LayoutC = LayoutC_;
436  using OperatorClass = arch::OpClassSimt;
437  static int const PartitionsK = Shape::kK / WarpShape::kK;
438 
440  using Operator = Operator_;
441 
443  using WarpCount = GemmShape<
444  Shape::kM / WarpShape::kM,
445  Shape::kN / WarpShape::kN,
446  PartitionsK
447  >;
448 
449  // Divisibility requirements
451  !(Shape::kM % WarpShape::kM) &&
452  !(Shape::kN % WarpShape::kN),
453  "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
454  );
455 
457  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
458 
460  static int const kThreads = WarpCount::kCount * kWarpSize;
461 
462  static int const kElementsPerAccess = 1;
463 
464  //
465  // Shared memory layouts
466  //
467 
468  using SmemLayoutA = layout::ColumnMajor;
469  using SmemLayoutB = layout::RowMajor;
470 
471  //
472  // Iterators to write to shared memory
473  //
474 
477  layout::PitchLinearShape<Shape::kK, Shape::kM>,
478  kThreads,
479  kElementsPerAccess
480  >;
481 
484 
487  MatrixShape<Shape::kM, Shape::kK>,
488  ElementA,
489  SmemLayoutA,
490  1,
492  >;
493 
496  layout::PitchLinearShape<Shape::kN, Shape::kK>,
497  kThreads,
498  kElementsPerAccess
499  >;
500 
503  MatrixShape<Shape::kK, Shape::kN>,
504  ElementB,
505  SmemLayoutB,
506  0,
508  >;
509 
510  //
511  // Warp-level matrix multiply operator
512  //
513 
514  // Define the warp-level op
515  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
516  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
517  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
518  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
519  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
520  "WarpShape must be divisible by ThreadTile shape.");
521  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
522  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
523  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
524  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
525  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
526 
527  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
528 
529  // these should have max of thread tile also
531  LaneM,
532  LaneN,
533  1>;
538  >;
539 
541  WarpShape,
542  ElementA,
543  SmemLayoutA,
544  ElementB,
545  SmemLayoutB,
546  ElementC,
547  LayoutC,
548  Policy
549  >;
550 
  // NOTE(review): the B-operand skew line (MatrixShape<0, 0>) was dropped by
  // extraction; the cross-reference shows only the A operand is skewed here.
552  using MmaPolicy = MmaPolicy<
553  MmaWarpSimt,
554  MatrixShape<kPaddingM, 0>, // skew for A matrix to avoid SMEM bank conflicts
556  WarpCount::kK
557  >;
558 };
559 
561 
/// Partial specialization of DefaultMmaCore: SIMT op class, instruction shape
/// GemmShape<1,1,1>, operand A column-major, operand B column-major, 2 stages.
/// B is transposed into row-major shared memory, so only kPaddingN (based on
/// ElementB's width) is computed and applied as the B-operand skew; A is
/// stored column-major directly and gets no skew.
/// NOTE(review): recovered from generated documentation; thread-map, iterator
/// and policy opening lines were lost in extraction -- restore from the
/// original header before compiling.
569 template <
572  typename Shape_,
574  typename WarpShape_,
576  typename ElementA_,
578  typename ElementB_,
580  typename ElementC_,
582  typename LayoutC_,
584  typename Operator_>
585 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
586  layout::ColumnMajor, ElementB_, layout::ColumnMajor,
587  ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
588  > {
589  using Shape = Shape_;
590  using WarpShape = WarpShape_;
592  using ElementA = ElementA_;
594  using ElementB = ElementB_;
596  using ElementC = ElementC_;
597  using LayoutC = LayoutC_;
598  using OperatorClass = arch::OpClassSimt;
599  static int const PartitionsK = Shape::kK / WarpShape::kK;
600 
602  using Operator = Operator_;
603 
605  using WarpCount = GemmShape<
606  Shape::kM / WarpShape::kM,
607  Shape::kN / WarpShape::kN,
608  PartitionsK
609  >;
610 
611  // Divisibility requirements
613  !(Shape::kM % WarpShape::kM) &&
614  !(Shape::kN % WarpShape::kN),
615  "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
616  );
617 
619  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
620 
622  static int const kThreads = WarpCount::kCount * kWarpSize;
623 
624  static int const kElementsPerAccess = 1;
625 
626  //
627  // Shared memory layouts
628  //
629 
630  using SmemLayoutA = layout::ColumnMajor;
631  using SmemLayoutB = layout::RowMajor;
632 
633  //
634  // Iterators to write to shared memory
635  //
636 
639  layout::PitchLinearShape<Shape::kM, Shape::kK>,
640  kThreads,
641  kElementsPerAccess
642  >;
643 
646  MatrixShape<Shape::kM, Shape::kK>,
647  ElementA,
648  SmemLayoutA,
649  1,
651  >;
652 
655  layout::PitchLinearShape<Shape::kK, Shape::kN>,
656  kThreads,
657  kElementsPerAccess
658  >;
659 
662 
665  MatrixShape<Shape::kK, Shape::kN>,
666  ElementB,
667  SmemLayoutB,
668  0,
670  >;
671 
672  //
673  // Warp-level matrix multiply operator
674  //
675 
676  // Define the warp-level op
677  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
678  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
679  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
680  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
681  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
682  "WarpShape must be divisible by ThreadTile shape.");
683  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
684  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
685  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
686  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
687  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
688 
689  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
690 
691  // these should have max of thread tile also
693  LaneM,
694  LaneN,
695  1>;
700  >;
701 
703  WarpShape,
704  ElementA,
705  SmemLayoutA,
706  ElementB,
707  SmemLayoutB,
708  ElementC,
709  LayoutC,
710  Policy
711  >;
712 
  // NOTE(review): the A-operand skew line (MatrixShape<0, 0>) was dropped by
  // extraction; the cross-reference shows only the B operand is skewed here.
714  using MmaPolicy = MmaPolicy<
715  MmaWarpSimt,
717  MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts
718  WarpCount::kK
719  >;
720 };
721 
723 
/// Partial specialization of DefaultMmaCore: SIMT op class, int8_t operands,
/// instruction shape GemmShape<1,1,4> (four-wide K per instruction --
/// presumably the dp4a path; confirm against the warp-level operator),
/// operand A column-major, operand B row-major, 2 stages. Shared memory uses
/// interleaved-by-4 layouts (SmemLayoutB = RowMajorInterleaved<4> is visible;
/// the SmemLayoutA line was dropped by extraction).
/// NOTE(review): recovered from generated documentation; thread-map, iterator
/// and policy opening lines were lost -- restore from the original header
/// before compiling.
731 template <
734  typename Shape_,
736  typename WarpShape_,
738  typename ElementC_,
740  typename LayoutC_,
742  typename Operator_>
743 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
744  layout::ColumnMajor, int8_t, layout::RowMajor, ElementC_,
745  LayoutC_, arch::OpClassSimt, 2, Operator_
746  > {
747 
748  using Shape = Shape_;
749  using WarpShape = WarpShape_;
751  using ElementA = int8_t;
753  using ElementB = int8_t;
755  using ElementC = ElementC_;
756  using LayoutC = LayoutC_;
757  using OperatorClass = arch::OpClassSimt;
758  static int const PartitionsK = Shape::kK / WarpShape::kK;
759 
761  using Operator = Operator_;
762 
764  using WarpCount = GemmShape<
765  Shape::kM / WarpShape::kM,
766  Shape::kN / WarpShape::kN,
767  PartitionsK
768  >;
769 
770  // Divisibility requirements
772  !(Shape::kM % WarpShape::kM) &&
773  !(Shape::kN % WarpShape::kN),
774  "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
775  );
776 
778  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
779 
781  static int const kThreads = WarpCount::kCount * kWarpSize;
782 
783  //
784  // Shared memory layouts
785  //
786 
788  using SmemLayoutB = layout::RowMajorInterleaved<4>;
789 
790  //
791  // Iterators to write to shared memory
792  //
793 
796  layout::PitchLinearShape<Shape::kM, Shape::kK>,
797  kThreads,
799  >;
800 
803  MatrixShape<Shape::kM, Shape::kK>,
804  ElementA,
805  SmemLayoutA,
806  1,
808  >;
809 
810 
813  layout::PitchLinearShape<Shape::kN, Shape::kK>,
814  kThreads,
816  >;
817 
820  MatrixShape<Shape::kK, Shape::kN>,
821  ElementB,
822  SmemLayoutB,
823  0,
825  >;
826 
827  //
828  // Warp-level matrix multiply operator
829  //
830 
831  // Define the warp-level op
832  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
833  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
834  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
835  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
836  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
837  "WarpShape must be divisible by ThreadTile shape.");
838  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
839  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
840  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
841  static const int LaneM = cutlass::const_min(4, ThreadTileM);
842  static const int LaneN = cutlass::const_min(4, ThreadTileN);
843  // these should have max of thread tile also
845  LaneM,
846  LaneN,
847  4>;
848 
853  >;
854 
856  WarpShape,
857  ElementA,
858  SmemLayoutA,
859  ElementB,
860  SmemLayoutB,
861  ElementC,
862  LayoutC,
863  Policy,
864  PartitionsK
865  >;
866 
  // NOTE(review): the A-operand skew line (MatrixShape<0, 0>) was dropped by
  // extraction; the cross-reference shows both skews are zero in this policy.
868  using MmaPolicy = MmaPolicy<
869  MmaWarpSimt,
871  MatrixShape<0, 0>,
872  WarpCount::kK
873  >;
874 };
875 
878 //
/// Partial specialization of DefaultMmaCore: SIMT op class, int8_t operands,
/// instruction shape GemmShape<1,1,4>, operand A row-major, operand B
/// column-major, 2 stages. Both operands are transposed on the way into
/// interleaved shared memory, so kPaddingM and kPaddingN are both computed.
/// NOTE(review): recovered from generated documentation; thread-map, iterator
/// and policy opening lines were lost, and the two skew lines of MmaPolicy
/// are missing -- per the cross-reference they are MatrixShape<kPaddingM, 0>
/// (A) and MatrixShape<0, kPaddingN> (B). Restore from the original header
/// before compiling.
885 template <
888  typename Shape_,
890  typename WarpShape_,
892  typename ElementC_,
894  typename LayoutC_,
896  typename Operator_>
897 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
898  layout::RowMajor, int8_t, layout::ColumnMajor, ElementC_,
899  LayoutC_, arch::OpClassSimt, 2, Operator_
900  > {
901 
902  using Shape = Shape_;
903  using WarpShape = WarpShape_;
905  using ElementA = int8_t;
907  using ElementB = int8_t;
909  using ElementC = ElementC_;
910  using LayoutC = LayoutC_;
911  using OperatorClass = arch::OpClassSimt;
912  static int const PartitionsK = Shape::kK / WarpShape::kK;
913 
915  using Operator = Operator_;
916 
918  using WarpCount = GemmShape<
919  Shape::kM / WarpShape::kM,
920  Shape::kN / WarpShape::kN,
921  PartitionsK
922  >;
923 
924  // Divisibility requirements
926  !(Shape::kM % WarpShape::kM) &&
927  !(Shape::kN % WarpShape::kN),
928  "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
929  );
930 
932  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
933 
935  static int const kThreads = WarpCount::kCount * kWarpSize;
936 
937  //
938  // Shared memory layouts
939  //
940 
942  using SmemLayoutB = layout::RowMajorInterleaved<4>;
943 
944  //
945  // Iterators to write to shared memory
946  //
947 
950  layout::PitchLinearShape<Shape::kK, Shape::kM>,
951  kThreads,
953  >;
954 
957 
960  MatrixShape<Shape::kM, Shape::kK>,
961  ElementA,
962  SmemLayoutA,
963  1,
965  >;
966 
967 
970  layout::PitchLinearShape<Shape::kK, Shape::kN>,
971  kThreads,
973  >;
974 
977 
980  MatrixShape<Shape::kK, Shape::kN>,
981  ElementB,
982  SmemLayoutB,
983  0,
985  >;
986 
987  //
988  // Warp-level matrix multiply operator
989  //
990 
991  // Define the warp-level op
992  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
993  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
994  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
995  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
996  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
997  "WarpShape must be divisible by ThreadTile shape.");
998  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
999  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
1000  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
1001  static const int LaneM = cutlass::const_min(4, ThreadTileM);
1002  static const int LaneN = cutlass::const_min(4, ThreadTileN);
1003  // these should have max of thread tile also
1005  LaneM,
1006  LaneN,
1007  4>;
1008 
1012  LaneMmaShape
1013  >;
1014 
1016  WarpShape,
1017  ElementA,
1018  SmemLayoutA,
1019  ElementB,
1020  SmemLayoutB,
1021  ElementC,
1022  LayoutC,
1023  Policy,
1024  PartitionsK
1025  >;
1026 
1027  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
1028  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
1029 
1031  using MmaPolicy = MmaPolicy<
1032  MmaWarpSimt,
1035  WarpCount::kK
1036  >;
1037 };
1038 
1041 //
/// Partial specialization of DefaultMmaCore: SIMT op class, int8_t operands,
/// instruction shape GemmShape<1,1,4>, operand A row-major, operand B
/// row-major, 2 stages. A is transposed into interleaved shared memory;
/// both kPaddingM and kPaddingN are computed below, though per the
/// cross-reference only the A skew (MatrixShape<kPaddingM, 0>) is non-zero
/// in the resulting MmaPolicy.
/// NOTE(review): recovered from generated documentation; thread-map, iterator
/// and policy opening lines (and the MmaPolicy skew lines) were lost in
/// extraction -- restore from the original header before compiling.
1048 template <
1051  typename Shape_,
1053  typename WarpShape_,
1055  typename ElementC_,
1057  typename LayoutC_,
1059  typename Operator_>
1060 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
1061  layout::RowMajor, int8_t, layout::RowMajor, ElementC_,
1062  LayoutC_, arch::OpClassSimt, 2, Operator_
1063  > {
1064 
1065  using Shape = Shape_;
1066  using WarpShape = WarpShape_;
1068  using ElementA = int8_t;
1070  using ElementB = int8_t;
1072  using ElementC = ElementC_;
1073  using LayoutC = LayoutC_;
1074  using OperatorClass = arch::OpClassSimt;
1075  static int const PartitionsK = Shape::kK / WarpShape::kK;
1076 
1078  using Operator = Operator_;
1079 
1081  using WarpCount = GemmShape<
1082  Shape::kM / WarpShape::kM,
1083  Shape::kN / WarpShape::kN,
1084  PartitionsK
1085  >;
1086 
1087  // Divisibility requirements
1088  static_assert(
1089  !(Shape::kM % WarpShape::kM) &&
1090  !(Shape::kN % WarpShape::kN),
1091  "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
1092  );
1093 
1095  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
1096 
1098  static int const kThreads = WarpCount::kCount * kWarpSize;
1099 
1100  //
1101  // Shared memory layouts
1102  //
1103 
1105  using SmemLayoutB = layout::RowMajorInterleaved<4>;
1106 
1107  //
1108  // Iterators to write to shared memory
1109  //
1110 
1113  layout::PitchLinearShape<Shape::kK, Shape::kM>,
1114  kThreads,
1116  >;
1117 
1120 
1123  MatrixShape<Shape::kM, Shape::kK>,
1124  ElementA,
1125  SmemLayoutA,
1126  1,
1128  >;
1129 
1132  layout::PitchLinearShape<Shape::kN, Shape::kK>,
1133  kThreads,
1135  >;
1136 
1139  MatrixShape<Shape::kK, Shape::kN>,
1140  ElementB,
1141  SmemLayoutB,
1142  0,
1144  >;
1145 
1146  //
1147  // Warp-level matrix multiply operator
1148  //
1149 
1150  // Define the warp-level op
1151  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
1152  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
1153  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
1154  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
1155  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
1156  "WarpShape must be divisible by ThreadTile shape.");
1157  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
1158  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
1159  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
1160  static const int LaneM = cutlass::const_min(4, ThreadTileM);
1161  static const int LaneN = cutlass::const_min(4, ThreadTileN);
1162  // these should have max of thread tile also
1164  LaneM,
1165  LaneN,
1166  4>;
1167 
1171  LaneMmaShape
1172  >;
1173 
1175  WarpShape,
1176  ElementA,
1177  SmemLayoutA,
1178  ElementB,
1179  SmemLayoutB,
1180  ElementC,
1181  LayoutC,
1182  Policy,
1183  PartitionsK
1184  >;
1185 
1186  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
1187  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
1188 
1190  using MmaPolicy = MmaPolicy<
1191  MmaWarpSimt,
1194  WarpCount::kK
1195  >;
1196 };
1197 
1200 //
/// Partial specialization of DefaultMmaCore: SIMT op class, int8_t operands,
/// instruction shape GemmShape<1,1,4>, operand A column-major, operand B
/// column-major, 2 stages. B is transposed into interleaved shared memory;
/// both paddings are computed below, though per the cross-reference only the
/// B skew (MatrixShape<0, kPaddingN>) is non-zero in the resulting MmaPolicy.
/// NOTE(review): recovered from generated documentation; thread-map, iterator
/// and policy opening lines (and the MmaPolicy skew lines) were lost in
/// extraction -- restore from the original header before compiling.
1207 template <
1210  typename Shape_,
1212  typename WarpShape_,
1214  typename ElementC_,
1216  typename LayoutC_,
1218  typename Operator_>
1219 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
1220  layout::ColumnMajor, int8_t, layout::ColumnMajor, ElementC_,
1221  LayoutC_, arch::OpClassSimt, 2, Operator_
1222  > {
1223 
1224  using Shape = Shape_;
1225  using WarpShape = WarpShape_;
1227  using ElementA = int8_t;
1229  using ElementB = int8_t;
1231  using ElementC = ElementC_;
1232  using LayoutC = LayoutC_;
1233  using OperatorClass = arch::OpClassSimt;
1234  static int const PartitionsK = Shape::kK / WarpShape::kK;
1235 
1237  using Operator = Operator_;
1238 
1240  using WarpCount = GemmShape<
1241  Shape::kM / WarpShape::kM,
1242  Shape::kN / WarpShape::kN,
1243  PartitionsK
1244  >;
1245 
1246  // Divisibility requirements
1247  static_assert(
1248  !(Shape::kM % WarpShape::kM) &&
1249  !(Shape::kN % WarpShape::kN),
1250  "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
1251  );
1252 
1254  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
1255 
1257  static int const kThreads = WarpCount::kCount * kWarpSize;
1258 
1259  //
1260  // Shared memory layouts
1261  //
1262 
1264  using SmemLayoutB = layout::RowMajorInterleaved<4>;
1265 
1266  //
1267  // Iterators to write to shared memory
1268  //
1269 
1272  layout::PitchLinearShape<Shape::kM, Shape::kK>,
1273  kThreads,
1275  >;
1276 
1279  MatrixShape<Shape::kM, Shape::kK>,
1280  ElementA,
1281  SmemLayoutA,
1282  1,
1284  >;
1285 
1286 
1289  layout::PitchLinearShape<Shape::kK, Shape::kN>,
1290  kThreads,
1292  >;
1293 
1296 
1299  MatrixShape<Shape::kK, Shape::kN>,
1300  ElementB,
1301  SmemLayoutB,
1302  0,
1304  >;
1305 
1306  //
1307  // Warp-level matrix multiply operator
1308  //
1309 
1310  // Define the warp-level op
1311  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
1312  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
1313  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
1314  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
1315  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
1316  "WarpShape must be divisible by ThreadTile shape.");
1317  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
1318  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
1319  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
1320  static const int LaneM = cutlass::const_min(4, ThreadTileM);
1321  static const int LaneN = cutlass::const_min(4, ThreadTileN);
1322  // these should have max of thread tile also
1324  LaneM,
1325  LaneN,
1326  4>;
1327 
1331  LaneMmaShape
1332  >;
1333 
1335  WarpShape,
1336  ElementA,
1337  SmemLayoutA,
1338  ElementB,
1339  SmemLayoutB,
1340  ElementC,
1341  LayoutC,
1342  Policy,
1343  PartitionsK
1344  >;
1345 
1346  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
1347  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
1348 
1350  using MmaPolicy = MmaPolicy<
1351  MmaWarpSimt,
1354  WarpCount::kK
1355  >;
1356 };
1357 
1358 } // namespace threadblock
1359 } // namespace gemm
1360 } // namespace cutlass
Describes the lane policy used by warp-level matrix multiply operators targeting SIMT instructions...
Describes the size of a matrix tile.
Definition: matrix_shape.h:42
Templates implementing loading of tiles from pitch-linear rank=2 tensors.
MmaPolicy< MmaWarpSimt, MatrixShape< kPaddingM, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_simt.h:1195
Definition: aligned_buffer.h:35
#define constexpr
Definition: platform.h:137
Query the number of threads per warp.
Definition: gemm/warp/mma.h:43
Definition: default_mma_core.h:90
MmaPolicy< MmaWarpSimt, MatrixShape< 0, 0 >, MatrixShape< 0, kPaddingN >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_simt.h:719
Templates implementing how threads are mapped to a given tile.
Definition: pitch_linear_thread_map.h:431
Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
Definition: mma_simt.h:74
Mapping function for column-major matrices.
Definition: layout/matrix.h:142
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...
Describes the arrangement and configuration of per-lane operations in warp-level matrix multiply...
Definition: mma_simt_policy.h:46
MmaPolicy< MmaWarpSimt, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_simt.h:873
Defines a Shape template for matrix tiles.
MmaPolicy< MmaWarpSimt, MatrixShape< kPaddingM, 0 >, MatrixShape< 0, kPaddingN >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_simt.h:1036
Defines the size of an element in bits.
Definition: numeric_types.h:42
Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping.
Definition: pitch_linear_thread_map.h:713
Defines basic properties needed by CTA-level GEMMs assuming expectations about data layout of the glo...
Top-level include for all CUTLASS numeric types.
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57
Definition: regular_tile_iterator.h:50
#define static_assert(__e, __m)
Definition: platform.h:153
constexpr int simt_transpose_padding(int threads, int crosswise, int size_in_bits)
Computes padding in shared memory to perform efficient transpose without bank conflicts.
Definition: default_mma_core_simt.h:67
MmaPolicy< MmaWarpSimt, MatrixShape< 0, 0 >, MatrixShape< 0, kPaddingN >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_simt.h:1355
Templates implementing loading of tiles from pitch-linear rank=2 tensors.
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
constexpr int simt_get_warp_threads_m()
Definition: default_mma_core_simt.h:62
MmaPolicy< MmaWarpSimt, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Used for partial specialization.
Definition: default_mma_core_simt.h:229
Math utilities.
MmaPolicy< MmaWarpSimt, MatrixShape< kPaddingN, 0 >, MatrixShape< 0, kPaddingN >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_simt.h:395
Templates implementing warp-level matrix multiply-accumulate operations.
CUTLASS_HOST_DEVICE constexpr int const_min(int a, int b)
Definition: fast_math.h:219
MmaPolicy< MmaWarpSimt, MatrixShape< kPaddingM, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_simt.h:557
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:59
Basic include for CUTLASS.
Definition: pitch_linear_thread_map.h:59
Definition: layout/matrix.h:237