CUTLASS: cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise > Struct Template Reference

CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers

#include <tensor_op_multiplicand_sm75.h>

Public Types
using	Index = int32_t
	Index type used for coordinates. More...

using	LongIndex = int64_t
	Long index type used for offsets. More...

using	TensorCoord = PitchLinearCoord
	Logical coordinate. More...

using	Stride = Coord< kStrideRank, Index, LongIndex >
	Stride vector. More...

using	TileShape = PitchLinearShape< kTileShapeContiguous, kTileShapeStride >

using	PartitionShape = PitchLinearShape< 4, 4 >
	Fundamental partition shape in units of vectors. More...

using	PartitionCount = PitchLinearShape< TileShape::kContiguous/PartitionShape::kContiguous, TileShape::kStrided/PartitionShape::kStrided >

using	AccessCount = PitchLinearShape< PartitionShape::kContiguous, PartitionShape::kStrided >

Public Member Functions
CUTLASS_HOST_DEVICE	TensorOpMultiplicand (Index ldm=0)
	Ctor. More...

CUTLASS_HOST_DEVICE	TensorOpMultiplicand (Stride stride)
	Ctor. More...

CUTLASS_HOST_DEVICE LongIndex	operator() (TensorCoord const &coord) const

CUTLASS_HOST_DEVICE Stride	stride () const
	Returns the stride of the layout. More...

CUTLASS_HOST_DEVICE Stride &	stride ()
	Returns the stride of the layout. More...

CUTLASS_HOST_DEVICE LongIndex	capacity (TensorCoord const &extent) const

Static Public Member Functions
static CUTLASS_HOST_DEVICE TensorOpMultiplicand	packed (TensorCoord const &extent)
	Helper returns a layout to a tightly packed tensor. More...

Static Public Attributes
static int const	kRank = 2
	Logical rank of tensor. More...

static int const	kStrideRank = 1
	Rank of stride vector. More...

static int const	kAccessSize = 128
	This layout is optimized for 128b accesses. More...

static int const	kElementSize = ElementSize

static int const	kElementsPerAccess = kAccessSize / kElementSize

static int const	kCrosswise = Crosswise

static int const	kTileShapeContiguous = 128 / (kAccessSize / 8)

static int const	kFactor
	Number of kblocks to store PartitionShape::kContiguous Elements. More...

static int const	kTileShapeStride

Detailed Description

template<int ElementSize, int Crosswise>
struct cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >

Template based on element size (in bits) - defined in terms of pitch-linear memory and Crosswise size (in elements).

Member Typedef Documentation

template<int ElementSize, int Crosswise>

using cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::AccessCount = PitchLinearShape<PartitionShape::kContiguous, PartitionShape::kStrided>

template<int ElementSize, int Crosswise>

using cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::Index = int32_t

template<int ElementSize, int Crosswise>

using cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::LongIndex = int64_t

template<int ElementSize, int Crosswise>

using cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::PartitionCount = PitchLinearShape<TileShape::kContiguous / PartitionShape::kContiguous, TileShape::kStrided / PartitionShape::kStrided>

template<int ElementSize, int Crosswise>

using cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::PartitionShape = PitchLinearShape<4, 4>

template<int ElementSize, int Crosswise>

using cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::Stride = Coord<kStrideRank, Index, LongIndex>

template<int ElementSize, int Crosswise>

using cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::TensorCoord = PitchLinearCoord

template<int ElementSize, int Crosswise>

using cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::TileShape = PitchLinearShape<kTileShapeContiguous, kTileShapeStride>

Fundamental tile shape in units of vectors. For TN kblock=32 and 8x8x16 shapes, TileShape = <8, 4>. For the rest, TileShape = <8, 8>.

Constructor & Destructor Documentation

template<int ElementSize, int Crosswise>

CUTLASS_HOST_DEVICE cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::TensorOpMultiplicand ( Index ldm = 0 )

inline

template<int ElementSize, int Crosswise>

CUTLASS_HOST_DEVICE cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::TensorOpMultiplicand ( Stride stride )

inline

Member Function Documentation

template<int ElementSize, int Crosswise>

CUTLASS_HOST_DEVICE LongIndex cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::capacity ( TensorCoord const & extent ) const

inline

Computes the number of contiguous elements needed to store a tensor with the given size.

template<int ElementSize, int Crosswise>

CUTLASS_HOST_DEVICE LongIndex cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::operator() ( TensorCoord const & coord ) const

inline

Returns the offset of a coordinate in linear memory. Assumes the coordinate has the convention (contiguous, strided).

template<int ElementSize, int Crosswise>

static CUTLASS_HOST_DEVICE TensorOpMultiplicand cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::packed ( TensorCoord const & extent )

inline static

template<int ElementSize, int Crosswise>

CUTLASS_HOST_DEVICE Stride cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::stride ( ) const

inline

template<int ElementSize, int Crosswise>

CUTLASS_HOST_DEVICE Stride& cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::stride ( )

inline

Member Data Documentation

template<int ElementSize, int Crosswise>

int const cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::kAccessSize = 128

static

template<int ElementSize, int Crosswise>

int const cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::kCrosswise = Crosswise

static

template<int ElementSize, int Crosswise>

int const cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::kElementSize = ElementSize

static

template<int ElementSize, int Crosswise>

int const cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::kElementsPerAccess = kAccessSize / kElementSize

static

template<int ElementSize, int Crosswise>

int const cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::kFactor

static

Initial value:

= kTileShapeContiguous * kElementsPerAccess / kCrosswise

template<int ElementSize, int Crosswise>

int const cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::kRank = 2

static

template<int ElementSize, int Crosswise>

int const cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::kStrideRank = 1

static

template<int ElementSize, int Crosswise>

int const cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::kTileShapeContiguous = 128 / (kAccessSize / 8)

static

Contiguous dimension of the tile shape matches one shared memory cache line - 128B. For a 128-bit access size, it equals 8 accesses.

template<int ElementSize, int Crosswise>

int const cutlass::layout::TensorOpMultiplicand< ElementSize, Crosswise >::kTileShapeStride

static

Initial value:

=
      ((kTileShapeContiguous / kFactor) > (32 / kTileShapeContiguous))
          ? (kTileShapeContiguous / kFactor)
          : (32 / kTileShapeContiguous)

The strided dimension needs to be at least WarpSize(32) / kTileShapeContiguous for a warp to access. To ensure conflict-free access, it also needs to be at least (kTileShapeContiguous / kFactor).

The documentation for this struct was generated from the following file:

tensor_op_multiplicand_sm75.h