cutlass/default__mma__core__sm50_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/array.h"

 #include "cutlass/numeric_types.h"
 #include "cutlass/matrix_shape.h"

 #include "cutlass/layout/matrix.h"
 #include "cutlass/transform/pitch_linear_thread_map.h"
 #include "cutlass/transform/threadblock/regular_tile_iterator.h"

 #include "cutlass/gemm/warp/mma_simt.h"
 #include "cutlass/gemm/threadblock/default_mma_core.h"


 namespace cutlass {
 namespace gemm {
 namespace threadblock {


 /// Partial specialization of DefaultMmaCore selected when:
 ///
 ///   - the instruction shape is GemmShape<1, 1, 1>,
 ///   - operand A is layout::ColumnMajor,
 ///   - operand B is layout::RowMajor,
 ///   - the operator class is arch::OpClassSimt,
 ///   - the integer argument following the operator class is 2
 ///     (NOTE(review): presumably the stage count - confirm against the
 ///     primary template in default_mma_core.h).
 ///
 /// It publishes the warp arrangement, thread count, shared-memory layouts and
 /// iterators, the warp-level multiply operator, and the MmaPolicy derived
 /// from those choices.
 template <
     /// Threadblock-scoped GEMM tile shape (must provide kM, kN and kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (must provide kM, kN and kK)
     typename WarpShape_,
     /// Element type of operand A
     typename ElementA_,
     /// Element type of operand B
     typename ElementB_,
     /// Element type of operand C
     typename ElementC_,
     /// Layout of operand C
     typename LayoutC_,
     /// Forwarded to the primary template's argument list; not otherwise
     /// referenced inside this specialization
     typename Operator_>
 struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
                       layout::ColumnMajor, ElementB_, layout::RowMajor,
                       ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_> {
   using Shape = Shape_;
   using WarpShape = WarpShape_;
   /// Fixed by this specialization's argument list (there is no
   /// InstructionShape_ template parameter to forward).
   using InstructionShape = GemmShape<1, 1, 1>;
   using ElementA = ElementA_;
   using LayoutA = layout::ColumnMajor;
   using ElementB = ElementB_;
   using LayoutB = layout::RowMajor;
   using ElementC = ElementC_;
   using LayoutC = LayoutC_;
   using OperatorClass = arch::OpClassSimt;

   /// Number of warp tiles along each GEMM dimension of the threadblock tile.
   /// The kK quotient is plain integer division; only kM and kN are checked
   /// for exact divisibility below.
   using WarpCount = GemmShape<
     Shape::kM / WarpShape::kM,
     Shape::kN / WarpShape::kN,
     Shape::kK / WarpShape::kK
   >;

   // Divisibility requirements
   static_assert(
     !(Shape::kM % WarpShape::kM) &&
     !(Shape::kN % WarpShape::kN),
     "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
   );

   /// Threads per warp, queried for the operator class this specialization
   /// is selected by (SIMT) rather than for the tensor-op class.
   static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;

   /// Total threads: WarpCount::kCount warps of kWarpSize threads each.
   static int const kThreads = WarpCount::kCount * kWarpSize;

   //
   // Shared memory layouts
   //

   /// Shared-memory layout for A: same layout type as LayoutA.
   using SmemLayoutA = layout::ColumnMajor;

   /// Shared-memory layout for B: same layout type as LayoutB.
   using SmemLayoutB = layout::RowMajor;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for A over a pitch-linear <kM, kK> shape using kThreads
   /// threads; the trailing 1 is presumably elements per access - TODO confirm
   /// against PitchLinearStripminedThreadMap.
   using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
     layout::PitchLinearShape<Shape::kM, Shape::kK>,
     kThreads,
     1
   >;

   /// Shared-memory tile iterator for A over a kM-by-kK tile. The integer
   /// argument (1) is presumably the advance rank - TODO confirm against
   /// RegularTileIterator.
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
     MatrixShape<Shape::kM, Shape::kK>,
     ElementA,
     SmemLayoutA,
     1,
     IteratorThreadMapA
   >;

   /// Thread map for B over a pitch-linear <kN, kK> shape using kThreads
   /// threads; the trailing 1 is presumably elements per access - TODO confirm
   /// against PitchLinearStripminedThreadMap.
   using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
     layout::PitchLinearShape<Shape::kN, Shape::kK>,
     kThreads,
     1
   >;

   /// Shared-memory tile iterator for B over a kK-by-kN tile. The integer
   /// argument (0) is presumably the advance rank - TODO confirm against
   /// RegularTileIterator.
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
     MatrixShape<Shape::kK, Shape::kN>,
     ElementB,
     SmemLayoutB,
     0,
     IteratorThreadMapB
   >;

   //
   // Warp-level matrix multiply operator
   //

   /// Warp-level SIMT multiply operator. Its MmaSimtPolicy is built from a
   /// MatrixShape<4, 8>, a RowMajorInterleaved<2> layout, and a GemmShape whose
   /// M and N extents are the number of A and B elements that fit in 128 bits.
   using WarpMma = cutlass::gemm::warp::MmaSimt<
     WarpShape,
     ElementA,
     SmemLayoutA,
     ElementB,
     SmemLayoutB,
     ElementC,
     LayoutC,
     warp::MmaSimtPolicy<
       MatrixShape<4, 8>,
       layout::RowMajorInterleaved<2>,
       GemmShape<
         128 / sizeof_bits<ElementA>::value,
         128 / sizeof_bits<ElementB>::value,
         1
       >
     >
   >;

   /// Policy bundling WarpMma with two MatrixShape<0, 0> arguments
   /// (presumably shared-memory padding for A and B - TODO confirm against
   /// MmaPolicy) and the warp count along K.
   using MmaPolicy = MmaPolicy<
     WarpMma,
     MatrixShape<0, 0>,
     MatrixShape<0, 0>,
     WarpCount::kK
   >;
 };


 } // namespace threadblock
 } // namespace gemm
 } // namespace cutlass
cutlass::MatrixShape
Describes the size of a matrix tile.
Definition: matrix_shape.h:42

cutlass
Definition: aligned_buffer.h:35

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm50.h:84

cutlass::gemm::warp::WarpSize
Query the number of threads per warp.
Definition: gemm/warp/mma.h:43

cutlass::gemm::threadblock::DefaultMmaCore
Definition: default_mma_core.h:90

pitch_linear_thread_map.h
Templates implementing how threads are mapped to a given tile.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::MmaPolicy
MmaPolicy< WarpMma, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm50.h:190

cutlass::gemm::warp::MmaSimt
Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
Definition: mma_simt.h:74

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm50.h:88

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::OperatorClass
arch::OpClassSimt OperatorClass
Definition: default_mma_core_sm50.h:92

cutlass::layout::ColumnMajor
Mapping function for column-major matrices.
Definition: layout/matrix.h:142

cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm50.h:90

cutlass::gemm::warp::MmaSimtPolicy
Describes the arrangement and configuration of per-lane operations in warp-level matrix multiply...
Definition: mma_simt_policy.h:46

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::sizeof_bits
Defines the size of an element in bits.
Definition: numeric_types.h:42

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::InstructionShape
InstructionShape_ InstructionShape
Definition: default_mma_core_sm50.h:85

default_mma_core.h
Defines basic properties needed by CTA-level GEMMs assuming expectations about data layout of the glo...

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::gemm::GemmShape
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57

cutlass::transform::threadblock::RegularTileIterator
Definition: regular_tile_iterator.h:50

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm50.h:86

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm50.h:91

regular_tile_iterator.h
Templates implementing storing of tiles from pitch-linear rank=2 tensors.

matrix.h
Defines layout functions used by TensorRef and derived classes.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::Shape
Shape_ Shape
Definition: default_mma_core_sm50.h:83

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_, >::WarpMma
cutlass::gemm::warp::MmaSimt< WarpShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, ElementC, LayoutC, warp::MmaSimtPolicy< MatrixShape< 4, 8 >, layout::RowMajorInterleaved< 2 >, GemmShape< 128/sizeof_bits< ElementA >::value, 128/sizeof_bits< ElementB >::value, 1 > > > WarpMma
Definition: default_mma_core_sm50.h:182

mma_simt.h
Templates implementing warp-level matrix multiply-accumulate operations.

cutlass.h
Basic include for CUTLASS.

cutlass::transform::PitchLinearStripminedThreadMap
Definition: pitch_linear_thread_map.h:59

cutlass::layout::RowMajorInterleaved
Definition: layout/matrix.h:237