cutlass/default__mma__core__sm70_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/array.h"

 #include "cutlass/numeric_types.h"
 #include "cutlass/matrix_shape.h"


 #include "cutlass/layout/tensor_op_multiplicand_sm70.h"
 #include "cutlass/transform/pitch_linear_thread_map.h"
 #include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h"

 #include "cutlass/gemm/warp/mma_tensor_op_sm70.h"
 #include "cutlass/gemm/threadblock/default_mma_core.h"


 namespace cutlass {
 namespace gemm {
 namespace threadblock {


 /// Partial specialization of DefaultMmaCore selected when:
 ///
 ///   A layout:          layout::ColumnMajor
 ///   B layout:          layout::RowMajor
 ///   Instruction shape: GemmShape<8, 8, 4>
 ///   Operator class:    arch::OpClassTensorOp
 ///
 /// NOTE(review): the literal `2` in the specialization argument list is presumably the
 /// pipeline stage count -- confirm against the primary template in default_mma_core.h.
 template <
     /// Threadblock-scoped GEMM tile shape (must expose kM, kN, kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (must expose kM, kN, kK)
     typename WarpShape_,
     /// Element type of the A operand
     typename ElementA_,
     /// Element type of the B operand
     typename ElementB_,
     /// Element type forwarded as ElementC to the warp-level operator
     typename ElementC_,
     /// Layout forwarded as LayoutC to the warp-level operator
     typename LayoutC_,
     /// Operator tag; only re-exported as `Operator` by this specialization
     typename Operator_>
 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
                       layout::ColumnMajor, ElementB_, layout::RowMajor,
                       ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
                       > {
   //
   // Re-exported template arguments and fixed properties of this specialization
   //

   using Shape = Shape_;
   using WarpShape = WarpShape_;
   using InstructionShape = GemmShape<8, 8, 4>;

   using ElementA = ElementA_;
   using LayoutA = layout::ColumnMajor;

   using ElementB = ElementB_;
   using LayoutB = layout::RowMajor;

   using ElementC = ElementC_;
   using LayoutC = LayoutC_;

   using OperatorClass = arch::OpClassTensorOp;

   /// Default Operator
   using Operator = Operator_;

   // Divisibility requirements: the threadblock tile must split evenly into warp
   // tiles along M and N. (K is not checked by this assertion.)
   static_assert(
     (Shape::kM % WarpShape::kM == 0) && (Shape::kN % WarpShape::kN == 0),
     "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
   );

   /// Ratio of the threadblock tile to the warp tile along each of M, N and K
   using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
                               Shape::kN / WarpShape::kN,
                               Shape::kK / WarpShape::kK>;

   /// Threads per warp, as reported by warp::WarpSize for this operator class
   static int const kWarpSize = warp::WarpSize<OperatorClass>::value;

   /// Thread count used by the thread maps below: WarpCount::kCount * kWarpSize
   static int const kThreads = WarpCount::kCount * kWarpSize;

   /// Access width in bits; divided by the element width to size each thread-map access
   static int const kAccessSizeInBits = 128;

   //
   // Shared memory layouts
   //

   /// Shared memory layout for operand A
   using SmemLayoutA = layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
       sizeof_bits<ElementA>::value>;

   /// Shared memory layout for operand B
   using SmemLayoutB = layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
       sizeof_bits<ElementB>::value>;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for the A tile, described as a kM-by-kK pitch-linear shape.
   /// NOTE(review): PitchLinearShape<8, 4> is presumably the per-warp thread
   /// arrangement (8 * 4 = 32 threads) -- confirm in pitch_linear_thread_map.h.
   using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kM, Shape::kK>,
       kThreads,
       layout::PitchLinearShape<8, 4>,
       kAccessSizeInBits / sizeof_bits<ElementA>::value>;

   /// Tile iterator over the A tile (kM x kK) using SmemLayoutA.
   /// NOTE(review): the integer argument (1) is presumably the advance rank -- confirm.
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kM, Shape::kK>,
       ElementA,
       SmemLayoutA,
       1,
       IteratorThreadMapA>;

   /// Thread map for the B tile, described as a kN-by-kK pitch-linear shape
   using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kN, Shape::kK>,
       kThreads,
       layout::PitchLinearShape<8, 4>,
       kAccessSizeInBits / sizeof_bits<ElementB>::value>;

   /// Tile iterator over the B tile (kK x kN) using SmemLayoutB.
   /// NOTE(review): the integer argument (0) is presumably the advance rank -- confirm.
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kK, Shape::kN>,
       ElementB,
       SmemLayoutB,
       0,
       IteratorThreadMapB>;

   //
   // Warp-level matrix multiply operator
   //

   /// Policy handed to the warp-level operator: wraps an arch::Mma described with a
   /// GemmShape<16, 16, 4> shape, 32 threads, and a row-major C layout.
   using Policy = warp::MmaTensorOpPolicy<
       arch::Mma<
           GemmShape<16, 16, 4>,
           32,
           ElementA,
           LayoutA,
           ElementB,
           LayoutB,
           ElementC,
           layout::RowMajor,
           arch::OpMultiplyAdd>,
       MatrixShape<1, 1>>;

   /// Warp-level operator, reading A and B through the shared memory layouts above
   using MmaTensorOp = warp::MmaVoltaTensorOp<
       WarpShape,
       ElementA,
       SmemLayoutA,
       ElementB,
       SmemLayoutB,
       ElementC,
       LayoutC,
       Policy>;

   /// Policy used to define MmaPipelined.
   /// NOTE(review): the two MatrixShape<0, 0> arguments are presumably shared memory
   /// padding for A and B -- confirm against the MmaPolicy definition.
   using MmaPolicy = MmaPolicy<
       MmaTensorOp,
       MatrixShape<0, 0>,
       MatrixShape<0, 0>,
       WarpCount::kK>;
 };

 /// Partial specialization of DefaultMmaCore selected when:
 ///
 ///   A layout:          layout::RowMajor
 ///   B layout:          layout::ColumnMajor
 ///   Instruction shape: GemmShape<8, 8, 4>
 ///   Operator class:    arch::OpClassTensorOp
 ///
 /// NOTE(review): the literal `2` in the specialization argument list is presumably the
 /// pipeline stage count -- confirm against the primary template in default_mma_core.h.
 template <
     /// Threadblock-scoped GEMM tile shape (must expose kM, kN, kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (must expose kM, kN, kK)
     typename WarpShape_,
     /// Element type of the A operand
     typename ElementA_,
     /// Element type of the B operand
     typename ElementB_,
     /// Element type forwarded as ElementC to the warp-level operator
     typename ElementC_,
     /// Layout forwarded as LayoutC to the warp-level operator
     typename LayoutC_,
     /// Operator tag; only re-exported as `Operator` by this specialization
     typename Operator_>
 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
                       layout::RowMajor, ElementB_, layout::ColumnMajor,
                       ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
                       > {
   //
   // Re-exported template arguments and fixed properties of this specialization
   //

   using Shape = Shape_;
   using WarpShape = WarpShape_;
   using InstructionShape = GemmShape<8, 8, 4>;

   using ElementA = ElementA_;
   using LayoutA = layout::RowMajor;

   using ElementB = ElementB_;
   using LayoutB = layout::ColumnMajor;

   using ElementC = ElementC_;
   using LayoutC = LayoutC_;

   using OperatorClass = arch::OpClassTensorOp;

   /// Default Operator
   using Operator = Operator_;

   // Divisibility requirements: the threadblock tile must split evenly into warp
   // tiles along M and N. (K is not checked by this assertion.)
   static_assert(
     (Shape::kM % WarpShape::kM == 0) && (Shape::kN % WarpShape::kN == 0),
     "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
   );

   /// Ratio of the threadblock tile to the warp tile along each of M, N and K
   using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
                               Shape::kN / WarpShape::kN,
                               Shape::kK / WarpShape::kK>;

   /// Threads per warp, as reported by warp::WarpSize for this operator class
   static int const kWarpSize = warp::WarpSize<OperatorClass>::value;

   /// Thread count used by the thread maps below: WarpCount::kCount * kWarpSize
   static int const kThreads = WarpCount::kCount * kWarpSize;

   /// Access width in bits; divided by the element width to size each thread-map access
   static int const kAccessSizeInBits = 128;

   //
   // Shared memory layouts
   //

   /// Shared memory layout for operand A (parameterized on element width and Shape::kK)
   using SmemLayoutA = layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
       sizeof_bits<ElementA>::value,
       Shape::kK>;

   /// Shared memory layout for operand B (parameterized on element width and Shape::kK)
   using SmemLayoutB = layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
       sizeof_bits<ElementB>::value,
       Shape::kK>;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for the A tile, described as a kK-by-kM pitch-linear shape.
   /// NOTE(review): PitchLinearShape<4, 8> is presumably the per-warp thread
   /// arrangement (4 * 8 = 32 threads) -- confirm in pitch_linear_thread_map.h.
   using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kK, Shape::kM>,
       kThreads,
       layout::PitchLinearShape<4, 8>,
       kAccessSizeInBits / sizeof_bits<ElementA>::value>;

   /// Tile iterator over the A tile (kM x kK) using SmemLayoutA.
   /// NOTE(review): the integer argument (0) is presumably the advance rank -- confirm.
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kM, Shape::kK>,
       ElementA,
       SmemLayoutA,
       0,
       IteratorThreadMapA>;

   /// Thread map for the B tile, described as a kK-by-kN pitch-linear shape
   using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kK, Shape::kN>,
       kThreads,
       layout::PitchLinearShape<4, 8>,
       kAccessSizeInBits / sizeof_bits<ElementB>::value>;

   /// Tile iterator over the B tile (kK x kN) using SmemLayoutB.
   /// NOTE(review): the integer argument (1) is presumably the advance rank -- confirm.
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kK, Shape::kN>,
       ElementB,
       SmemLayoutB,
       1,
       IteratorThreadMapB>;

   //
   // Warp-level matrix multiply operator
   //

   /// Policy handed to the warp-level operator: wraps an arch::Mma described with a
   /// GemmShape<16, 16, 4> shape, 32 threads, and a row-major C layout.
   using Policy = warp::MmaTensorOpPolicy<
       arch::Mma<
           GemmShape<16, 16, 4>,
           32,
           ElementA,
           LayoutA,
           ElementB,
           LayoutB,
           ElementC,
           layout::RowMajor,
           arch::OpMultiplyAdd>,
       MatrixShape<1, 1>>;

   /// Warp-level operator, reading A and B through the shared memory layouts above
   using MmaTensorOp = warp::MmaVoltaTensorOp<
       WarpShape,
       ElementA,
       SmemLayoutA,
       ElementB,
       SmemLayoutB,
       ElementC,
       LayoutC,
       Policy>;

   /// Policy used to define MmaPipelined.
   /// NOTE(review): the two MatrixShape<0, 0> arguments are presumably shared memory
   /// padding for A and B -- confirm against the MmaPolicy definition.
   using MmaPolicy = MmaPolicy<
       MmaTensorOp,
       MatrixShape<0, 0>,
       MatrixShape<0, 0>,
       WarpCount::kK>;
 };


 /// Partial specialization of DefaultMmaCore selected when:
 ///
 ///   A layout:          layout::RowMajor
 ///   B layout:          layout::RowMajor
 ///   Instruction shape: GemmShape<8, 8, 4>
 ///   Operator class:    arch::OpClassTensorOp
 ///
 /// NOTE(review): the literal `2` in the specialization argument list is presumably the
 /// pipeline stage count -- confirm against the primary template in default_mma_core.h.
 template <
     /// Threadblock-scoped GEMM tile shape (must expose kM, kN, kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (must expose kM, kN, kK)
     typename WarpShape_,
     /// Element type of the A operand
     typename ElementA_,
     /// Element type of the B operand
     typename ElementB_,
     /// Element type forwarded as ElementC to the warp-level operator
     typename ElementC_,
     /// Layout forwarded as LayoutC to the warp-level operator
     typename LayoutC_,
     /// Operator tag; only re-exported as `Operator` by this specialization
     typename Operator_>
 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
                       layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
                       LayoutC_, arch::OpClassTensorOp, 2, Operator_
                       > {
   //
   // Re-exported template arguments and fixed properties of this specialization
   //

   using Shape = Shape_;
   using WarpShape = WarpShape_;
   using InstructionShape = GemmShape<8, 8, 4>;

   using ElementA = ElementA_;
   using LayoutA = layout::RowMajor;

   using ElementB = ElementB_;
   using LayoutB = layout::RowMajor;

   using ElementC = ElementC_;
   using LayoutC = LayoutC_;

   using OperatorClass = arch::OpClassTensorOp;

   /// Default Operator
   using Operator = Operator_;

   // Divisibility requirements: the threadblock tile must split evenly into warp
   // tiles along M and N. (K is not checked by this assertion.)
   static_assert(
     (Shape::kM % WarpShape::kM == 0) && (Shape::kN % WarpShape::kN == 0),
     "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
   );

   /// Ratio of the threadblock tile to the warp tile along each of M, N and K
   using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
                               Shape::kN / WarpShape::kN,
                               Shape::kK / WarpShape::kK>;

   /// Threads per warp, as reported by warp::WarpSize for this operator class
   static int const kWarpSize = warp::WarpSize<OperatorClass>::value;

   /// Thread count used by the thread maps below: WarpCount::kCount * kWarpSize
   static int const kThreads = WarpCount::kCount * kWarpSize;

   /// Access width in bits; divided by the element width to size each thread-map access
   static int const kAccessSizeInBits = 128;

   //
   // Shared memory layouts
   //

   /// Shared memory layout for operand A (parameterized on element width and Shape::kK)
   using SmemLayoutA = layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
       sizeof_bits<ElementA>::value,
       Shape::kK>;

   /// Shared memory layout for operand B
   using SmemLayoutB = layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
       sizeof_bits<ElementB>::value>;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for the A tile, described as a kK-by-kM pitch-linear shape.
   /// NOTE(review): PitchLinearShape<4, 8> is presumably the per-warp thread
   /// arrangement (4 * 8 = 32 threads) -- confirm in pitch_linear_thread_map.h.
   using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kK, Shape::kM>,
       kThreads,
       layout::PitchLinearShape<4, 8>,
       kAccessSizeInBits / sizeof_bits<ElementA>::value>;

   /// Tile iterator over the A tile (kM x kK) using SmemLayoutA.
   /// NOTE(review): the integer argument (0) is presumably the advance rank -- confirm.
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kM, Shape::kK>,
       ElementA,
       SmemLayoutA,
       0,
       IteratorThreadMapA>;

   /// Thread map for the B tile, described as a kN-by-kK pitch-linear shape.
   /// NOTE(review): PitchLinearShape<8, 4> is presumably the per-warp thread
   /// arrangement (8 * 4 = 32 threads) -- confirm in pitch_linear_thread_map.h.
   using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kN, Shape::kK>,
       kThreads,
       layout::PitchLinearShape<8, 4>,
       kAccessSizeInBits / sizeof_bits<ElementB>::value>;

   /// Tile iterator over the B tile (kK x kN) using SmemLayoutB.
   /// NOTE(review): the integer argument (0) is presumably the advance rank -- confirm.
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kK, Shape::kN>,
       ElementB,
       SmemLayoutB,
       0,
       IteratorThreadMapB>;

   //
   // Warp-level matrix multiply operator
   //

   /// Policy handed to the warp-level operator: wraps an arch::Mma described with a
   /// GemmShape<16, 16, 4> shape, 32 threads, and a row-major C layout.
   using Policy = warp::MmaTensorOpPolicy<
       arch::Mma<
           GemmShape<16, 16, 4>,
           32,
           ElementA,
           LayoutA,
           ElementB,
           LayoutB,
           ElementC,
           layout::RowMajor,
           arch::OpMultiplyAdd>,
       MatrixShape<1, 1>>;

   /// Warp-level operator, reading A and B through the shared memory layouts above
   using MmaTensorOp = warp::MmaVoltaTensorOp<
       WarpShape,
       ElementA,
       SmemLayoutA,
       ElementB,
       SmemLayoutB,
       ElementC,
       LayoutC,
       Policy>;

   /// Policy used to define MmaPipelined.
   /// NOTE(review): the two MatrixShape<0, 0> arguments are presumably shared memory
   /// padding for A and B -- confirm against the MmaPolicy definition.
   using MmaPolicy = MmaPolicy<
       MmaTensorOp,
       MatrixShape<0, 0>,
       MatrixShape<0, 0>,
       WarpCount::kK>;
 };


 /// Partial specialization of DefaultMmaCore selected when:
 ///
 ///   A layout:          layout::ColumnMajor
 ///   B layout:          layout::ColumnMajor
 ///   Instruction shape: GemmShape<8, 8, 4>
 ///   Operator class:    arch::OpClassTensorOp
 ///
 /// NOTE(review): the literal `2` in the specialization argument list is presumably the
 /// pipeline stage count -- confirm against the primary template in default_mma_core.h.
 template <
     /// Threadblock-scoped GEMM tile shape (must expose kM, kN, kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (must expose kM, kN, kK)
     typename WarpShape_,
     /// Element type of the A operand
     typename ElementA_,
     /// Element type of the B operand
     typename ElementB_,
     /// Element type forwarded as ElementC to the warp-level operator
     typename ElementC_,
     /// Layout forwarded as LayoutC to the warp-level operator
     typename LayoutC_,
     /// Operator tag; only re-exported as `Operator` by this specialization
     typename Operator_>
 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
                       layout::ColumnMajor, ElementB_, layout::ColumnMajor,
                       ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
                       > {
   //
   // Re-exported template arguments and fixed properties of this specialization
   //

   using Shape = Shape_;
   using WarpShape = WarpShape_;
   using InstructionShape = GemmShape<8, 8, 4>;

   using ElementA = ElementA_;
   using LayoutA = layout::ColumnMajor;

   using ElementB = ElementB_;
   using LayoutB = layout::ColumnMajor;

   using ElementC = ElementC_;
   using LayoutC = LayoutC_;

   using OperatorClass = arch::OpClassTensorOp;

   /// Default Operator
   using Operator = Operator_;

   // Divisibility requirements: the threadblock tile must split evenly into warp
   // tiles along M and N. (K is not checked by this assertion.)
   static_assert(
     (Shape::kM % WarpShape::kM == 0) && (Shape::kN % WarpShape::kN == 0),
     "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
   );

   /// Ratio of the threadblock tile to the warp tile along each of M, N and K
   using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
                               Shape::kN / WarpShape::kN,
                               Shape::kK / WarpShape::kK>;

   /// Threads per warp, as reported by warp::WarpSize for this operator class
   static int const kWarpSize = warp::WarpSize<OperatorClass>::value;

   /// Thread count used by the thread maps below: WarpCount::kCount * kWarpSize
   static int const kThreads = WarpCount::kCount * kWarpSize;

   /// Access width in bits; divided by the element width to size each thread-map access
   static int const kAccessSizeInBits = 128;

   //
   // Shared memory layouts
   //

   /// Shared memory layout for operand A
   using SmemLayoutA = layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
       sizeof_bits<ElementA>::value>;

   /// Shared memory layout for operand B (parameterized on element width and Shape::kK)
   using SmemLayoutB = layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
       sizeof_bits<ElementB>::value,
       Shape::kK>;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for the A tile, described as a kM-by-kK pitch-linear shape.
   /// NOTE(review): PitchLinearShape<8, 4> is presumably the per-warp thread
   /// arrangement (8 * 4 = 32 threads) -- confirm in pitch_linear_thread_map.h.
   using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kM, Shape::kK>,
       kThreads,
       layout::PitchLinearShape<8, 4>,
       kAccessSizeInBits / sizeof_bits<ElementA>::value>;

   /// Tile iterator over the A tile (kM x kK) using SmemLayoutA.
   /// NOTE(review): the integer argument (1) is presumably the advance rank -- confirm.
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kM, Shape::kK>,
       ElementA,
       SmemLayoutA,
       1,
       IteratorThreadMapA>;

   /// Thread map for the B tile, described as a kK-by-kN pitch-linear shape.
   /// NOTE(review): PitchLinearShape<4, 8> is presumably the per-warp thread
   /// arrangement (4 * 8 = 32 threads) -- confirm in pitch_linear_thread_map.h.
   using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kK, Shape::kN>,
       kThreads,
       layout::PitchLinearShape<4, 8>,
       kAccessSizeInBits / sizeof_bits<ElementB>::value>;

   /// Tile iterator over the B tile (kK x kN) using SmemLayoutB.
   /// NOTE(review): the integer argument (1) is presumably the advance rank -- confirm.
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kK, Shape::kN>,
       ElementB,
       SmemLayoutB,
       1,
       IteratorThreadMapB>;

   //
   // Warp-level matrix multiply operator
   //

   /// Policy handed to the warp-level operator: wraps an arch::Mma described with a
   /// GemmShape<16, 16, 4> shape, 32 threads, and a row-major C layout.
   using Policy = warp::MmaTensorOpPolicy<
       arch::Mma<
           GemmShape<16, 16, 4>,
           32,
           ElementA,
           LayoutA,
           ElementB,
           LayoutB,
           ElementC,
           layout::RowMajor,
           arch::OpMultiplyAdd>,
       MatrixShape<1, 1>>;

   /// Warp-level operator, reading A and B through the shared memory layouts above
   using MmaTensorOp = warp::MmaVoltaTensorOp<
       WarpShape,
       ElementA,
       SmemLayoutA,
       ElementB,
       SmemLayoutB,
       ElementC,
       LayoutC,
       Policy>;

   /// Policy used to define MmaPipelined.
   /// NOTE(review): the two MatrixShape<0, 0> arguments are presumably shared memory
   /// padding for A and B -- confirm against the MmaPolicy definition.
   using MmaPolicy = MmaPolicy<
       MmaTensorOp,
       MatrixShape<0, 0>,
       MatrixShape<0, 0>,
       WarpCount::kK>;
 };

 } // namespace threadblock
 } // namespace gemm
 } // namespace cutlass
cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::OperatorClass
arch::OpClassTensorOp OperatorClass
Definition: default_mma_core_sm70.h:247

cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous
Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous.
Definition: tensor_op_multiplicand_sm70.h:630

cutlass::MatrixShape
Describes the size of a matrix tile.
Definition: matrix_shape.h:42

cutlass
Definition: aligned_buffer.h:35

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm70.h:551

regular_tile_iterator_tensor_op_sm70.h
Templates implementing loading of tiles from pitch-linear rank=2 tensors.

cutlass::gemm::warp::WarpSize
Query the number of threads per warp.
Definition: gemm/warp/mma.h:43

mma_tensor_op_sm70.h
Templates implementing warp-level matrix multiply-accumulate operations targeting Tensor Cores...

cutlass::gemm::threadblock::DefaultMmaCore
Definition: default_mma_core.h:90

pitch_linear_thread_map.h
Templates implementing how threads are mapped to a given tile.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm70.h:396

cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm70.h:848

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm70.h:398

cutlass::layout::ColumnMajor
Mapping function for column-major matrices.
Definition: layout/matrix.h:142

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Operator
Operator_ Operator
Default Operator.
Definition: default_mma_core_sm70.h:560

cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::OperatorClass
arch::OpClassTensorOp OperatorClass
Definition: default_mma_core_sm70.h:557

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm70.h:243

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::OperatorClass
arch::OpClassTensorOp OperatorClass
Definition: default_mma_core_sm70.h:402

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Shape
Shape_ Shape
Definition: default_mma_core_sm70.h:548

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaPolicy
MmaPolicy< MmaTensorOp, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm70.h:671

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm70.h:91

cutlass::sizeof_bits
Defines the size of an element in bits.
Definition: numeric_types.h:42

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm70.h:400

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaPolicy
MmaPolicy< MmaTensorOp, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm70.h:208

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm70.h:88

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Shape
Shape_ Shape
Definition: default_mma_core_sm70.h:83

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm70.h:553

default_mma_core.h
Defines basic properties needed by CTA-level GEMMs assuming expectations about data layout of the glo...

cutlass::transform::PitchLinearWarpRakedThreadMap
Definition: pitch_linear_thread_map.h:205

cutlass::gemm::warp::MmaVoltaTensorOp
Structure to compute the matrix product targeting Volta Tensor Cores.
Definition: mma_tensor_op_sm70.h:77

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaPolicy
MmaPolicy< MmaTensorOp, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm70.h:361

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm70.h:86

cutlass::gemm::GemmShape
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57

cutlass::transform::threadblock::RegularTileIterator
Definition: regular_tile_iterator.h:50

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm70.h:84

cutlass::gemm::warp::MmaTensorOpPolicy
Policy.
Definition: mma_tensor_op_policy.h:48

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm70.h:549

cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm70.h:943

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm70.h:556

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Operator
Operator_ Operator
Default Operator.
Definition: default_mma_core_sm70.h:250

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm70.h:394

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::OperatorClass
arch::OpClassTensorOp OperatorClass
Definition: default_mma_core_sm70.h:92

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm70.h:241

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Operator
Operator_ Operator
Default Operator.
Definition: default_mma_core_sm70.h:405

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaPolicy
MmaPolicy< MmaTensorOp, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm70.h:516

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Operator
Operator_ Operator
Default Operator.
Definition: default_mma_core_sm70.h:95

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm70.h:555

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm70.h:90

cutlass::arch::Mma
Matrix multiply-add operation.
Definition: arch/mma.h:92

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm70.h:401

cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous
Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous.
Definition: tensor_op_multiplicand_sm70.h:191

tensor_op_multiplicand_sm70.h

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm70.h:239

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Shape
Shape_ Shape
Definition: default_mma_core_sm70.h:238

cutlass.h
Basic include for CUTLASS.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Shape
Shape_ Shape
Definition: default_mma_core_sm70.h:393

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm70.h:246

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm70.h:245