CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
|
#include <default_mma_core_sm70.h>
Static Public Attributes | |
static int const | kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value |
Number of threads per warp. More... | |
static int const | kThreads = WarpCount::kCount * kWarpSize |
Number of threads total. More... | |
static int const | kAccessSizeInBits = 128 |
Size of a threadblock-scoped access. More... | |
Partial specialization:
A: row-major B: row-major Operator: tensor op class
This uses the default warp-level operator given tile sizes
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementA = ElementA_ |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementB = ElementB_ |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::ElementC = ElementC_ |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::InstructionShape = GemmShape<8, 8, 4> |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads, layout::PitchLinearShape<4, 8>, kAccessSizeInBits / sizeof_bits<ElementA>::value > |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads, layout::PitchLinearShape<8, 4>, kAccessSizeInBits / sizeof_bits<ElementB>::value > |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutA = layout::RowMajor |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutB = layout::RowMajor |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::LayoutC = LayoutC_ |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaPolicy = MmaPolicy< MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>, WarpCount::kK > |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp< WarpShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, ElementC, LayoutC, Policy > |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Operator = Operator_ |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::OperatorClass = arch::OpClassTensorOp |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Policy = cutlass::gemm::warp::MmaTensorOpPolicy< cutlass::arch::Mma< cutlass::gemm::GemmShape<16, 16, 4>, 32, ElementA, LayoutA, ElementB, LayoutB, ElementC, cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd >, cutlass::MatrixShape<1, 1> > |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::Shape = Shape_ |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::SmemIteratorA = transform::threadblock::RegularTileIterator< MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0, IteratorThreadMapA > |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::SmemIteratorB = transform::threadblock::RegularTileIterator< MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0, IteratorThreadMapB > |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::SmemLayoutA = layout::RowMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits<ElementA>::value, Shape::kK> |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::SmemLayoutB = layout::RowMajorVoltaTensorOpMultiplicandBCongruous< sizeof_bits<ElementB>::value> |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpCount = GemmShape< Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN, Shape::kK / WarpShape::kK > |
using cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 8, 8, 4 >, ElementA_, layout::RowMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_ >::WarpShape = WarpShape_ |
|
static |
|
static |
|
static |