#include <default_mma_core_simt.h>

Public Types
using	Shape = Shape_

using	WarpShape = WarpShape_

using	InstructionShape = GemmShape< 1, 1, 1 >

using	ElementA = ElementA_

using	LayoutA = layout::ColumnMajor

using	ElementB = ElementB_

using	LayoutB = layout::RowMajor

using	ElementC = ElementC_

using	LayoutC = LayoutC_

using	OperatorClass = arch::OpClassSimt

using	Operator = Operator_
	Default Operator. More...

using	WarpCount = GemmShape< Shape::kM/WarpShape::kM, Shape::kN/WarpShape::kN, PartitionsK >
	Number of warps present. More...

using	SmemLayoutA = layout::ColumnMajor

using	SmemLayoutB = layout::RowMajor

using	IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< layout::PitchLinearShape< Shape::kM, Shape::kK >, kThreads, kElementsPerAccess >
	ThreadMap of iterator A. More...

using	SmemIteratorA = transform::threadblock::RegularTileIterator< MatrixShape< Shape::kM, Shape::kK >, ElementA, SmemLayoutA, 1, IteratorThreadMapA >
	Shared memory iterator to A operand. More...

using	IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< layout::PitchLinearShape< Shape::kN, Shape::kK >, kThreads, kElementsPerAccess >
	ThreadMap of iterator B. More...

using	SmemIteratorB = transform::threadblock::RegularTileIterator< MatrixShape< Shape::kK, Shape::kN >, ElementB, SmemLayoutB, 0, IteratorThreadMapB >
	Shared memory iterator to B operand. More...

using	LaneMmaShape = cutlass::gemm::GemmShape< LaneM, LaneN, 1 >

using	Policy = cutlass::gemm::warp::MmaSimtPolicy< cutlass::MatrixShape< WarpNumThreadsM, WarpNumThreadsN >, cutlass::layout::RowMajorInterleaved< LaneLayout >, LaneMmaShape >

using	MmaWarpSimt = cutlass::gemm::warp::MmaSimt< WarpShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, ElementC, LayoutC, Policy >

using	MmaPolicy = MmaPolicy< MmaWarpSimt, MatrixShape< 0, 0 >, MatrixShape< 0, 0 >, WarpCount::kK >
	Policy type combining the warp-level operator MmaWarpSimt with WarpCount::kK partitions along K. More...

Static Public Attributes
static int const	PartitionsK = Shape::kK / WarpShape::kK

static int const	kWarpSize = warp::WarpSize<arch::OpClassSimt>::value
	Number of threads per warp. More...

static int const	kThreads = WarpCount::kCount * kWarpSize
	Number of threads total. More...

static int const	kElementsPerAccess = 1

static const int	WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>()

static const int	WarpNumThreadsN = kWarpSize / WarpNumThreadsM

static const int	ThreadTileM = WarpShape::kM / WarpNumThreadsM

static const int	ThreadTileN = WarpShape::kN / WarpNumThreadsN

static const int	LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1

static const int	numElementsA = 128 / sizeof_bits<ElementA>::value

static const int	numElementsB = 128 / sizeof_bits<ElementB>::value

static const int	LaneM = cutlass::const_min(numElementsA, ThreadTileM)

static const int	LaneN = cutlass::const_min(numElementsB, ThreadTileN)

Detailed Description

template<typename Shape_, typename WarpShape_, typename ElementA_, typename ElementB_, typename ElementC_, typename LayoutC_, typename Operator_>
struct cutlass::gemm::threadblock::DefaultMmaCore< Shape_, WarpShape_, GemmShape< 1, 1, 1 >, ElementA_, layout::ColumnMajor, ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_ >

Partial specialization:

A: column-major, B: row-major, Operator: SIMT class

This uses the default warp-level operator for the given tile sizes.