cutlass/default__mma__core__sm75_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/array.h"
 #include "cutlass/platform/platform.h"

 #include "cutlass/numeric_types.h"
 #include "cutlass/matrix_shape.h"

 #include "cutlass/layout/tensor_op_multiplicand_sm75.h"
 #include "cutlass/transform/pitch_linear_thread_map.h"
 #include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h"

 #include "cutlass/gemm/warp/default_mma_tensor_op.h"
 #include "cutlass/gemm/threadblock/default_mma_core.h"


 namespace cutlass {
 namespace gemm {
 namespace threadblock {


 /// Partial specialization of DefaultMmaCore selected when
 ///
 ///   A: layout::ColumnMajor
 ///   B: layout::RowMajor
 ///   Operator class: arch::OpClassTensorOp
 ///
 /// Both operands are given "congruous" tensor-op multiplicand layouts in
 /// shared memory and are written through warp-raked thread maps that use an
 /// 8x4 warp arrangement with 128-bit accesses.
 ///
 /// NOTE(review): the literal `2` among the specialization arguments is
 /// presumably the pipeline stage count -- confirm against default_mma_core.h.
 template <
     /// Threadblock-scoped GEMM tile shape (supplies kM, kN, kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (supplies kM, kN, kK)
     typename WarpShape_,
     /// Instruction shape forwarded to the warp-level tensor op
     typename InstructionShape_,
     /// Element type of operand A
     typename ElementA_,
     /// Element type of operand B
     typename ElementB_,
     /// Element type of operand C
     typename ElementC_,
     /// Layout of operand C
     typename LayoutC_,
     /// Operator tag forwarded to the warp-level tensor op
     typename Operator_>
 struct DefaultMmaCore<
     Shape_, WarpShape_, InstructionShape_,
     ElementA_, layout::ColumnMajor,
     ElementB_, layout::RowMajor,
     ElementC_, LayoutC_,
     arch::OpClassTensorOp, 2, Operator_> {

   // Template arguments re-exported under their canonical member names.
   using Shape = Shape_;
   using WarpShape = WarpShape_;
   using InstructionShape = InstructionShape_;
   using ElementA = ElementA_;
   using LayoutA = layout::ColumnMajor;
   using ElementB = ElementB_;
   using LayoutB = layout::RowMajor;
   using ElementC = ElementC_;
   using LayoutC = LayoutC_;
   using OperatorClass = arch::OpClassTensorOp;

   /// Number of warp tiles along each dimension of the threadblock tile
   using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
                               Shape::kN / WarpShape::kN,
                               Shape::kK / WarpShape::kK>;

   // Divisibility requirements: M and N must split evenly into warp tiles.
   static_assert(
       (Shape::kM % WarpShape::kM == 0) && (Shape::kN % WarpShape::kN == 0),
       "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");

   /// Number of threads per warp
   static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;

   /// Number of threads in the threadblock
   static int const kThreads = WarpCount::kCount * kWarpSize;

   /// Width, in bits, of each access made by the thread maps below
   static int const kAccessSizeInBits = 128;

   /// Default Operator
   using Operator = Operator_;

   //
   // Shared memory layouts
   //

   /// Shared memory layout of operand A
   using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
       sizeof_bits<ElementA>::value,
       static_cast<int>(128 / sizeof(ElementA))>;

   /// Shared memory layout of operand B
   using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
       sizeof_bits<ElementB>::value,
       static_cast<int>(128 / sizeof(ElementB))>;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for operand A: M is the contiguous extent, K the strided one
   using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
       layout::PitchLinearShape<8, 4>,
       kAccessSizeInBits / sizeof_bits<ElementA>::value>;

   /// Shared memory tile iterator for operand A
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
       IteratorThreadMapA>;

   /// Thread map for operand B: N is the contiguous extent, K the strided one
   using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
       layout::PitchLinearShape<8, 4>,
       kAccessSizeInBits / sizeof_bits<ElementB>::value>;

   /// Shared memory tile iterator for operand B
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
       IteratorThreadMapB>;

   //
   // Warp-level matrix multiply operator
   //

   /// Warp-level tensor op built from the shared memory layouts above
   using MmaTensorOp = typename warp::DefaultMmaTensorOp<
       WarpShape, InstructionShape,
       ElementA, SmemLayoutA,
       ElementB, SmemLayoutB,
       ElementC, LayoutC,
       Operator, WarpCount::kK>::Type;

   /// Policy used to define MmaPipelined
   using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
                               MatrixShape<0, 0>, WarpCount::kK>;
 };


 /// Partial specialization of DefaultMmaCore selected when
 ///
 ///   A: layout::RowMajor
 ///   B: layout::ColumnMajor
 ///   Operator class: arch::OpClassTensorOp
 ///
 /// Both operands are given "crosswise" tensor-op multiplicand layouts in
 /// shared memory, parameterized by Shape::kK. The warp thread arrangement of
 /// each operand is derived from that operand's own element width so that it
 /// matches the per-access element count handed to its thread map.
 ///
 /// NOTE(review): the literal `2` among the specialization arguments is
 /// presumably the pipeline stage count -- confirm against default_mma_core.h.
 template <
     /// Threadblock-scoped GEMM tile shape (supplies kM, kN, kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (supplies kM, kN, kK)
     typename WarpShape_,
     /// Instruction shape forwarded to the warp-level tensor op
     typename InstructionShape_,
     /// Element type of operand A
     typename ElementA_,
     /// Element type of operand B
     typename ElementB_,
     /// Element type of operand C
     typename ElementC_,
     /// Layout of operand C
     typename LayoutC_,
     /// Operator tag forwarded to the warp-level tensor op
     typename Operator_>
 struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
                       layout::RowMajor, ElementB_, layout::ColumnMajor,
                       ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
                       > {
   using Shape = Shape_;
   using WarpShape = WarpShape_;
   using InstructionShape = InstructionShape_;
   using ElementA = ElementA_;
   using LayoutA = layout::RowMajor;
   using ElementB = ElementB_;
   using LayoutB = layout::ColumnMajor;
   using ElementC = ElementC_;
   using LayoutC = LayoutC_;
   using OperatorClass = arch::OpClassTensorOp;

   /// Number of warp tiles along each dimension of the threadblock tile
   using WarpCount = GemmShape<
     Shape::kM / WarpShape::kM,
     Shape::kN / WarpShape::kN,
     Shape::kK / WarpShape::kK
   >;

   // Divisibility requirements
   static_assert(
     !(Shape::kM % WarpShape::kM) &&
     !(Shape::kN % WarpShape::kN),
     "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
   );

   /// Number of threads per warp
   static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;

   /// Number of threads in the threadblock
   static int const kThreads = WarpCount::kCount * kWarpSize;

   /// Width, in bits, of each access made by the thread maps below
   static int const kAccessSizeInBits = 128;

   /// Default Operator
   using Operator = Operator_;

   // Warp thread arrangement

   /// Threads of a warp laid along K for operand A: one per 128-bit access
   static int const kWarpThreadArrangementContiguousA =
       Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);

   /// Remaining threads of the warp, laid along the strided dimension of A
   static int const kWarpThreadArrangementStridedA =
       kWarpSize / kWarpThreadArrangementContiguousA;

   /// Threads of a warp laid along K for operand B. Computed from ElementB
   /// (not ElementA) so that it agrees with the elements-per-access value
   /// passed to IteratorThreadMapB when A and B differ in width.
   static int const kWarpThreadArrangementContiguousB =
       Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);

   /// Remaining threads of the warp, laid along the strided dimension of B
   static int const kWarpThreadArrangementStridedB =
       kWarpSize / kWarpThreadArrangementContiguousB;

   //
   // Shared memory layouts
   //

   /// Shared memory layout of operand A
   using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
       sizeof_bits<ElementA>::value, Shape::kK>;

   /// Shared memory layout of operand B
   using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
       sizeof_bits<ElementB>::value, Shape::kK>;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for operand A: K is the contiguous extent, M the strided one
   using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
       layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
                                kWarpThreadArrangementStridedA>,
       kAccessSizeInBits / sizeof_bits<ElementA>::value>;

   /// Shared memory tile iterator for operand A
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
     MatrixShape<Shape::kM, Shape::kK>,
     ElementA,
     SmemLayoutA,
     0,
     IteratorThreadMapA
   >;

   /// Thread map for operand B: K is the contiguous extent, N the strided one
   using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
       layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
                                kWarpThreadArrangementStridedB>,
       kAccessSizeInBits / sizeof_bits<ElementB>::value>;

   /// Shared memory tile iterator for operand B
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
     MatrixShape<Shape::kK, Shape::kN>,
     ElementB,
     SmemLayoutB,
     1,
     IteratorThreadMapB
   >;

   //
   // Warp-level matrix multiply operator
   //

   /// Warp-level tensor op built from the shared memory layouts above
   using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
       WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
       ElementC, LayoutC, Operator, WarpCount::kK>::Type;

   /// Policy used to define MmaPipelined
   using MmaPolicy = MmaPolicy<
     MmaTensorOp,
     MatrixShape<0, 0>,
     MatrixShape<0, 0>,
     WarpCount::kK
   >;
 };


 /// Partial specialization of DefaultMmaCore selected when
 ///
 ///   A: layout::RowMajor
 ///   B: layout::RowMajor
 ///   Operator class: arch::OpClassTensorOp
 ///
 /// Operand A is given a "crosswise" tensor-op multiplicand layout in shared
 /// memory (parameterized by Shape::kK); operand B is given a "congruous" one.
 ///
 /// NOTE(review): the literal `2` among the specialization arguments is
 /// presumably the pipeline stage count -- confirm against default_mma_core.h.
 template <
     /// Threadblock-scoped GEMM tile shape (supplies kM, kN, kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (supplies kM, kN, kK)
     typename WarpShape_,
     /// Instruction shape forwarded to the warp-level tensor op
     typename InstructionShape_,
     /// Element type of operand A
     typename ElementA_,
     /// Element type of operand B
     typename ElementB_,
     /// Element type of operand C
     typename ElementC_,
     /// Layout of operand C
     typename LayoutC_,
     /// Operator tag forwarded to the warp-level tensor op
     typename Operator_>
 struct DefaultMmaCore<
     Shape_, WarpShape_, InstructionShape_,
     ElementA_, layout::RowMajor,
     ElementB_, layout::RowMajor,
     ElementC_, LayoutC_,
     arch::OpClassTensorOp, 2, Operator_> {

   // Template arguments re-exported under their canonical member names.
   using Shape = Shape_;
   using WarpShape = WarpShape_;
   using InstructionShape = InstructionShape_;
   using ElementA = ElementA_;
   using LayoutA = layout::RowMajor;
   using ElementB = ElementB_;
   using LayoutB = layout::RowMajor;
   using ElementC = ElementC_;
   using LayoutC = LayoutC_;
   using OperatorClass = arch::OpClassTensorOp;

   /// Number of warp tiles along each dimension of the threadblock tile
   using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
                               Shape::kN / WarpShape::kN,
                               Shape::kK / WarpShape::kK>;

   // Divisibility requirements: M and N must split evenly into warp tiles.
   static_assert(
       (Shape::kM % WarpShape::kM == 0) && (Shape::kN % WarpShape::kN == 0),
       "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");

   /// Number of threads per warp
   static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;

   /// Number of threads in the threadblock
   static int const kThreads = WarpCount::kCount * kWarpSize;

   /// Width, in bits, of each access made by the thread maps below
   static int const kAccessSizeInBits = 128;

   /// Default Operator
   using Operator = Operator_;

   // Warp thread arrangement (only operand A needs a computed arrangement here)

   /// Threads of a warp laid along K for operand A: one per 128-bit access
   static int const kWarpThreadArrangementContiguousA =
       Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);

   /// Remaining threads of the warp, laid along the strided dimension of A
   static int const kWarpThreadArrangementStridedA =
       kWarpSize / kWarpThreadArrangementContiguousA;

   //
   // Shared memory layouts
   //

   /// Shared memory layout of operand A
   using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
       sizeof_bits<ElementA>::value,
       Shape::kK>;

   /// Shared memory layout of operand B
   using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
       sizeof_bits<ElementB>::value,
       static_cast<int>(128 / sizeof(ElementB))>;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for operand A: K is the contiguous extent, M the strided one
   using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kK, Shape::kM>,
       kThreads,
       layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
                                kWarpThreadArrangementStridedA>,
       kAccessSizeInBits / sizeof_bits<ElementA>::value>;

   /// Shared memory tile iterator for operand A
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
       IteratorThreadMapA>;

   /// Thread map for operand B: N is the contiguous extent, K the strided one;
   /// uses a fixed 8x4 warp arrangement.
   using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
       layout::PitchLinearShape<8, 4>,
       kAccessSizeInBits / sizeof_bits<ElementB>::value>;

   /// Shared memory tile iterator for operand B
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
       IteratorThreadMapB>;

   //
   // Warp-level matrix multiply operator
   //

   /// Warp-level tensor op built from the shared memory layouts above
   using MmaTensorOp = typename warp::DefaultMmaTensorOp<
       WarpShape, InstructionShape,
       ElementA, SmemLayoutA,
       ElementB, SmemLayoutB,
       ElementC, LayoutC,
       Operator, WarpCount::kK>::Type;

   /// Policy used to define MmaPipelined
   using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
                               MatrixShape<0, 0>, WarpCount::kK>;
 };


 /// Partial specialization of DefaultMmaCore selected when
 ///
 ///   A: layout::ColumnMajor
 ///   B: layout::ColumnMajor
 ///   Operator class: arch::OpClassTensorOp
 ///
 /// Operand A is given a "congruous" tensor-op multiplicand layout in shared
 /// memory; operand B is given a "crosswise" one parameterized by Shape::kK.
 /// The warp thread arrangement of B is derived from B's own element width so
 /// that it matches the per-access element count handed to its thread map.
 ///
 /// NOTE(review): the literal `2` among the specialization arguments is
 /// presumably the pipeline stage count -- confirm against default_mma_core.h.
 template <
     /// Threadblock-scoped GEMM tile shape (supplies kM, kN, kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (supplies kM, kN, kK)
     typename WarpShape_,
     /// Instruction shape forwarded to the warp-level tensor op
     typename InstructionShape_,
     /// Element type of operand A
     typename ElementA_,
     /// Element type of operand B
     typename ElementB_,
     /// Element type of operand C
     typename ElementC_,
     /// Layout of operand C
     typename LayoutC_,
     /// Operator tag forwarded to the warp-level tensor op
     typename Operator_>
 struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
                       layout::ColumnMajor, ElementB_, layout::ColumnMajor,
                       ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
                       > {
   using Shape = Shape_;
   using WarpShape = WarpShape_;
   using InstructionShape = InstructionShape_;
   using ElementA = ElementA_;
   using LayoutA = layout::ColumnMajor;
   using ElementB = ElementB_;
   using LayoutB = layout::ColumnMajor;
   using ElementC = ElementC_;
   using LayoutC = LayoutC_;
   using OperatorClass = arch::OpClassTensorOp;

   /// Number of warp tiles along each dimension of the threadblock tile
   using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
                               Shape::kN / WarpShape::kN,
                               Shape::kK / WarpShape::kK>;

   // Divisibility requirements
   static_assert(
       !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
       "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");

   /// Number of threads per warp
   static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;

   /// Number of threads in the threadblock
   static int const kThreads = WarpCount::kCount * kWarpSize;

   /// Width, in bits, of each access made by the thread maps below
   static int const kAccessSizeInBits = 128;

   /// Default Operator
   using Operator = Operator_;

   // Warp thread arrangement

   /// Threads of a warp laid along K for operand B. Computed from ElementB
   /// (not ElementA) so that it agrees with the elements-per-access value
   /// passed to IteratorThreadMapB when A and B differ in width.
   static int const kWarpThreadArrangementContiguousB =
       Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);

   /// Remaining threads of the warp, laid along the strided dimension of B
   static int const kWarpThreadArrangementStridedB =
       kWarpSize / kWarpThreadArrangementContiguousB;

   //
   // Shared memory layouts
   //

   /// Shared memory layout of operand A
   using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
       sizeof_bits<ElementA>::value, int(128 / sizeof(ElementA))>;

   /// Shared memory layout of operand B
   using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
       sizeof_bits<ElementB>::value, Shape::kK>;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for operand A: M is the contiguous extent, K the strided one;
   /// uses a fixed 8x4 warp arrangement.
   using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
       layout::PitchLinearShape<8, 4>,
       kAccessSizeInBits / sizeof_bits<ElementA>::value>;

   /// Shared memory tile iterator for operand A
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
       IteratorThreadMapA>;

   /// Thread map for operand B: K is the contiguous extent, N the strided one
   using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
       layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
       layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
                                kWarpThreadArrangementStridedB>,
       kAccessSizeInBits / sizeof_bits<ElementB>::value>;

   /// Shared memory tile iterator for operand B
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
       MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
       IteratorThreadMapB>;

   //
   // Warp-level matrix multiply operator
   //

   /// Warp-level tensor op built from the shared memory layouts above
   using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
       WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
       ElementC, LayoutC, Operator, WarpCount::kK>::Type;

   /// Policy used to define MmaPipelined
   using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
                                        MatrixShape<0, 0>, WarpCount::kK>;
 };

 /// Partial specialization of DefaultMmaCore selected when
 ///
 ///   A: layout::ColumnMajorInterleaved<InterleavedK>
 ///   B: layout::RowMajorInterleaved<InterleavedK>
 ///   Operator class: arch::OpClassTensorOp
 ///
 /// Both operands are given "crosswise" tensor-op multiplicand layouts in
 /// shared memory, parameterized by InterleavedK. Each operand has a
 /// warp-raked thread map with a 32x1 warp arrangement plus a transposed view
 /// of that map, and the transposed view is what the shared memory iterator
 /// uses.
 ///
 /// A single kElementsPerAccess value, computed from ElementA, is used for
 /// both operands.
 ///
 /// NOTE(review): the literal `2` among the specialization arguments is
 /// presumably the pipeline stage count -- confirm against default_mma_core.h.
 template <
     /// Threadblock-scoped GEMM tile shape (supplies kM, kN, kK)
     typename Shape_,
     /// Warp-scoped GEMM tile shape (supplies kM, kN, kK)
     typename WarpShape_,
     /// Instruction shape forwarded to the warp-level tensor op
     typename InstructionShape_,
     /// Element type of operand A
     typename ElementA_,
     /// Element type of operand B
     typename ElementB_,
     /// Element type of operand C
     typename ElementC_,
     /// Layout of operand C
     typename LayoutC_,
     /// Operator tag forwarded to the warp-level tensor op
     typename Operator_,
     /// Flag forwarded unchanged to the warp-level tensor op
     bool AccumulatorsInRowMajor,
     /// Interleaving factor shared by the layouts of A and B
     int InterleavedK>
 struct DefaultMmaCore<
     Shape_, WarpShape_, InstructionShape_,
     ElementA_, layout::ColumnMajorInterleaved<InterleavedK>,
     ElementB_, layout::RowMajorInterleaved<InterleavedK>,
     ElementC_, LayoutC_,
     arch::OpClassTensorOp, 2, Operator_,
     AccumulatorsInRowMajor> {

   // Template arguments re-exported under their canonical member names.
   using Shape = Shape_;
   using WarpShape = WarpShape_;
   using InstructionShape = InstructionShape_;
   using ElementA = ElementA_;
   using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
   using ElementB = ElementB_;
   using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
   using ElementC = ElementC_;
   using LayoutC = LayoutC_;
   using OperatorClass = arch::OpClassTensorOp;

   /// Interleaving factor, re-exported as a member constant
   static int const kInterleavedK = InterleavedK;

   /// Number of warp tiles along each dimension of the threadblock tile
   using WarpCount = GemmShape<
     Shape::kM / WarpShape::kM,
     Shape::kN / WarpShape::kN,
     Shape::kK / WarpShape::kK
   >;

   // Divisibility requirements: M and N must split evenly into warp tiles.
   static_assert(
     (Shape::kM % WarpShape::kM == 0) &&
     (Shape::kN % WarpShape::kN == 0),
     "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
   );

   /// Number of threads per warp
   static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;

   /// Number of threads in the threadblock
   static int const kThreads = WarpCount::kCount * kWarpSize;

   /// Width, in bits, of each access made by the thread maps below
   static int const kAccessSizeInBits = 128;

   /// Default Operator
   using Operator = Operator_;

   // Warp thread arrangement

   /// Elements moved by one 128-bit access (computed from ElementA)
   static int const kElementsPerAccess =
     kAccessSizeInBits / sizeof_bits<ElementA>::value;

   /// Accesses needed to cover one interleaved group of kInterleavedK elements
   static int const kWarpThreadArrangementContiguous =
     kInterleavedK / kElementsPerAccess;

   /// Remaining threads of the warp, laid along the strided dimension
   static int const kWarpThreadArrangementStrided =
     kWarpSize / kWarpThreadArrangementContiguous;

   //
   // Shared memory layouts
   //

   /// Shared memory layout of operand A
   using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
     sizeof_bits<ElementA>::value,
     kInterleavedK
   >;

   /// Shared memory layout of operand B
   using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
     sizeof_bits<ElementB>::value,
     kInterleavedK
   >;

   //
   // Iterators to write to shared memory
   //

   /// Thread map for operand A over the interleaved extent
   /// (kM * kInterleavedK contiguous, kK / kInterleavedK strided)
   using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
     layout::PitchLinearShape<
       Shape::kM * kInterleavedK,
       Shape::kK / kInterleavedK
     >,
     kThreads,
     layout::PitchLinearShape<32, 1>,
     kElementsPerAccess
   >;

   /// Transposed view of IteratorThreadMapA used by the shared memory iterator
   using SmemThreadMapA = transform::TransposePitchLinearThreadMap<
     IteratorThreadMapA,
     layout::PitchLinearShape<
       kWarpThreadArrangementContiguous,
       kWarpThreadArrangementStrided
     >
   >;

   /// Shared memory tile iterator for operand A
   using SmemIteratorA = transform::threadblock::RegularTileIterator<
     MatrixShape<Shape::kM, Shape::kK>,
     ElementA,
     SmemLayoutA,
     0,
     SmemThreadMapA
   >;

   /// Thread map for operand B over the interleaved extent
   /// (kN * kInterleavedK contiguous, kK / kInterleavedK strided)
   using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
     layout::PitchLinearShape<
       Shape::kN * kInterleavedK,
       Shape::kK / kInterleavedK
     >,
     kThreads,
     layout::PitchLinearShape<32, 1>,
     kElementsPerAccess
   >;

   /// Transposed view of IteratorThreadMapB used by the shared memory iterator
   using SmemThreadMapB = transform::TransposePitchLinearThreadMap<
     IteratorThreadMapB,
     layout::PitchLinearShape<
       kWarpThreadArrangementContiguous,
       kWarpThreadArrangementStrided
     >
   >;

   /// Shared memory tile iterator for operand B
   using SmemIteratorB = transform::threadblock::RegularTileIterator<
     MatrixShape<Shape::kK, Shape::kN>,
     ElementB,
     SmemLayoutB,
     1,
     SmemThreadMapB
   >;

   //
   // Warp-level matrix multiply operator
   //

   /// Warp-level tensor op built from the shared memory layouts above
   using MmaTensorOp = typename warp::DefaultMmaTensorOp<
     WarpShape,
     InstructionShape,
     ElementA,
     SmemLayoutA,
     ElementB,
     SmemLayoutB,
     ElementC,
     LayoutC,
     Operator,
     WarpCount::kK,
     AccumulatorsInRowMajor
   >::Type;

   /// Policy used to define MmaPipelined
   using MmaPolicy = MmaPolicy<
     MmaTensorOp,
     MatrixShape<0, 0>,
     MatrixShape<0, 0>,
     WarpCount::kK
   >;
 };


 } // namespace threadblock
 } // namespace gemm
 } // namespace cutlass
cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::OperatorClass
arch::OpClassTensorOp OperatorClass
Definition: default_mma_core_sm75.h:517

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm75.h:511

cutlass::MatrixShape
Describes the size of a matrix tile.
Definition: matrix_shape.h:42

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::InstructionShape
InstructionShape_ InstructionShape
Definition: default_mma_core_sm75.h:223

regular_tile_iterator_tensor_op.h
Templates implementing storing of tiles from pitch-linear rank=2 tensors.

cutlass
Definition: aligned_buffer.h:35

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm75.h:375

default_mma_tensor_op.h
Default warp-level GEMM operators selected by data type, size, and layouts of operands.

tensor_op_multiplicand_sm75.h

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm75.h:372

cutlass::gemm::warp::WarpSize
Query the number of threads per warp.
Definition: gemm/warp/mma.h:43

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::Shape
Shape_ Shape
Definition: default_mma_core_sm75.h:636

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm75.h:734

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm75.h:515

cutlass::gemm::threadblock::DefaultMmaCore
Definition: default_mma_core.h:90

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Shape
Shape_ Shape
Definition: default_mma_core_sm75.h:508

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaTensorOp
typename cutlass::gemm::warp::DefaultMmaTensorOp< WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, ElementC, LayoutC, Operator, WarpCount::kK >::Type MmaTensorOp
Definition: default_mma_core_sm75.h:325

pitch_linear_thread_map.h
Templates implementing how threads are mapped to a given tile.

cutlass::gemm::warp::DefaultMmaTensorOp
Partial specialization for m-by-n-by-kgroup.
Definition: default_mma_tensor_op.h:67

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous
Definition: tensor_op_multiplicand_sm75.h:422

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm75.h:835

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm75.h:513

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaTensorOp
typename cutlass::gemm::warp::DefaultMmaTensorOp< WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, ElementC, LayoutC, Operator, WarpCount::kK >::Type MmaTensorOp
Definition: default_mma_core_sm75.h:593

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::OperatorClass
arch::OpClassTensorOp OperatorClass
Definition: default_mma_core_sm75.h:94

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Shape
Shape_ Shape
Definition: default_mma_core_sm75.h:85

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Operator
Operator_ Operator
Default Operator.
Definition: default_mma_core_sm75.h:256

platform.h
C++ features that may be otherwise unimplemented for CUDA device functions.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaPolicy
MmaPolicy< MmaTensorOp, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm75.h:187

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm75.h:93

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm75.h:226

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::MmaPolicy
MmaPolicy< MmaTensorOp, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm75.h:740

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::OperatorClass
arch::OpClassTensorOp OperatorClass
Definition: default_mma_core_sm75.h:376

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm75.h:88

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm75.h:370

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm75.h:224

cutlass::layout::ColumnMajor
Mapping function for column-major matrices.
Definition: layout/matrix.h:142

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaPolicy
MmaPolicy< MmaTensorOp, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm75.h:597

cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

cutlass::transform::TransposePitchLinearThreadMap
Definition: pitch_linear_thread_map.h:333

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm75.h:90

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::ElementB
ElementB_ ElementB
Definition: default_mma_core_sm75.h:641

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm75.h:374

cutlass::sizeof_bits
Defines the size of an element in bits.
Definition: numeric_types.h:42

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm75.h:228

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::OperatorClass
arch::OpClassTensorOp OperatorClass
Definition: default_mma_core_sm75.h:230

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::InstructionShape
InstructionShape_ InstructionShape
Definition: default_mma_core_sm75.h:87

default_mma_core.h
Defines basic properties needed by CTA-level GEMMs assuming expectations about data layout of the glo...

cutlass::transform::PitchLinearWarpRakedThreadMap
Definition: pitch_linear_thread_map.h:205

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm75.h:229

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::InstructionShape
InstructionShape_ InstructionShape
Definition: default_mma_core_sm75.h:510

cutlass::gemm::GemmShape
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57

cutlass::transform::threadblock::RegularTileIterator
Definition: regular_tile_iterator.h:50

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm75.h:509

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::ElementA
ElementA_ ElementA
Definition: default_mma_core_sm75.h:639

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Operator
Operator_ Operator
Default Operator.
Definition: default_mma_core_sm75.h:402

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm75.h:86

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Operator
Operator_ Operator
Default Operator.
Definition: default_mma_core_sm75.h:539

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaPolicy
MmaPolicy< MmaTensorOp, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm75.h:474

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaTensorOp
typename cutlass::gemm::warp::DefaultMmaTensorOp< WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, ElementC, LayoutC, Operator, WarpCount::kK >::Type MmaTensorOp
Definition: default_mma_core_sm75.h:179

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::InstructionShape
InstructionShape_ InstructionShape
Definition: default_mma_core_sm75.h:638

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm75.h:643

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Operator
Operator_ Operator
Default Operator.
Definition: default_mma_core_sm75.h:120

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaTensorOp
typename cutlass::gemm::warp::DefaultMmaTensorOp< WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, ElementC, LayoutC, Operator, WarpCount::kK >::Type MmaTensorOp
Definition: default_mma_core_sm75.h:466

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementC
ElementC_ ElementC
Definition: default_mma_core_sm75.h:92

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::InstructionShape
InstructionShape_ InstructionShape
Definition: default_mma_core_sm75.h:369

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Shape
Shape_ Shape
Definition: default_mma_core_sm75.h:221

cutlass::layout::ColumnMajorInterleaved
Definition: layout/matrix.h:343

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm75.h:222

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::OperatorClass
arch::OpClassTensorOp OperatorClass
Definition: default_mma_core_sm75.h:645

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::MmaTensorOp
typename cutlass::gemm::warp::DefaultMmaTensorOp< WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, ElementC, LayoutC, Operator, WarpCount::kK, AccumulatorsInRowMajor >::Type MmaTensorOp
Definition: default_mma_core_sm75.h:736

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm75.h:637

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Shape
Shape_ Shape
Definition: default_mma_core_sm75.h:367

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpShape
WarpShape_ WarpShape
Definition: default_mma_core_sm75.h:368

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm75.h:644

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaPolicy
MmaPolicy< MmaTensorOp, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK > MmaPolicy
Policy used to define MmaPipelined.
Definition: default_mma_core_sm75.h:333

cutlass.h
Basic include for CUTLASS.

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajorInterleaved< InterleavedK >, ElementB_, layout::RowMajorInterleaved< InterleavedK >, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_, AccumulatorsInRowMajor >::Operator
Operator_ Operator
Default Operator.
Definition: default_mma_core_sm75.h:668

cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor, ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutC
LayoutC_ LayoutC
Definition: default_mma_core_sm75.h:516

cutlass::layout::RowMajorInterleaved
Definition: layout/matrix.h:237

cutlass::layout::RowMajorTensorOpMultiplicandCongruous
Definition: tensor_op_multiplicand_sm75.h:527