Namespaces
	detail

	kernel

	thread

Classes
struct	BlockForEach

struct	Gemm

struct	Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd >
	Partial specialization for multiply-add. More...

struct	Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate >
	Partial specialization for multiply-add-saturate. More...

struct	Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc >
	Partial specialization for XOR-popc. More...

struct	TensorDiagonalForEach
	Launches a kernel calling a functor for each element along a tensor's diagonal. More...

struct	TensorForEach
	Launches a kernel calling a functor for each element in a tensor's index space. More...

Functions
template<typename ElementA , typename LayoutA , typename ElementB , typename LayoutB , typename ElementC , typename LayoutC , typename ScalarType , typename AccumulatorType , typename InnerProductOp = multiply_add<AccumulatorType>, typename ConvertOp = NumericConverter<ElementC, ScalarType>>
void	compute_gemm (gemm::GemmCoord problem_size, ScalarType alpha, TensorRef< ElementA, LayoutA > tensor_a, TensorRef< ElementB, LayoutB > tensor_b, ScalarType beta, TensorRef< ElementC, LayoutC > tensor_c, TensorRef< ElementC, LayoutC > tensor_d, AccumulatorType initial_accum)

template<typename ElementA , typename LayoutA , typename ElementB , typename LayoutB , typename ElementC , typename LayoutC , typename ScalarType , typename AccumulatorType , typename InnerProductOp = multiply_add<AccumulatorType>, typename ConvertOp = NumericConverter<ElementC, ScalarType>>
void	compute_gemm (gemm::GemmCoord problem_size, ScalarType alpha, TensorRef< ElementA, LayoutA > tensor_a, TensorRef< ElementB, LayoutB > tensor_b, ScalarType beta, TensorRef< ElementC, LayoutC > tensor_c, AccumulatorType initial_accum)

template<typename TensorRefCollectionA , typename TensorRefCollectionB , typename TensorRefCollectionC , typename ScalarType , typename AccumulatorType , typename InnerProductOp , typename ConvertOp >
void	BatchedGemm (gemm::GemmCoord problem_size, int batch_count, ScalarType alpha, TensorRefCollectionA const &tensor_a, TensorRefCollectionB const &tensor_b, ScalarType beta, TensorRefCollectionC &tensor_c, AccumulatorType initial_accum)
	Computes a batch of GEMMs over a set of matrices of common dimension. More...

template<typename TensorRefCollectionA , typename TensorRefCollectionB , typename TensorRefCollectionC , typename ScalarType , typename AccumulatorType >
void	BatchedGemm (gemm::GemmCoord problem_size, int batch_count, ScalarType alpha, TensorRefCollectionA const &tensor_a, TensorRefCollectionB const &tensor_b, ScalarType beta, TensorRefCollectionC &tensor_c)

template<typename Element >
bool	BlockCompareEqual (Element const *ptr_A, Element const *ptr_B, size_t capacity, int grid_size=0, int block_size=0)
	Performs a bit-level equality check between two blocks. More...

template<typename Element >
bool	BlockCompareRelativelyEqual (Element const *ptr_A, Element const *ptr_B, size_t capacity, Element epsilon, Element nonzero_floor, int grid_size=0, int block_size=0)
	Performs a relative (tolerance-based) equality check between two blocks, controlled by epsilon and nonzero_floor. More...

template<typename Element , typename Layout >
void	TensorFillRandomGaussian (TensorView< Element, Layout > view, uint64_t seed, Element mean=Element(0), Element stddev=Element(1), int bits=-1)
	Fills a tensor with random values with a Gaussian distribution. More...

template<typename Element >
void	BlockFillRandomGaussian (Element *ptr, size_t capacity, uint64_t seed, Element mean=Element(0), Element stddev=Element(1), int bits=-1)
	Fills a block of data with random values with a Gaussian distribution. More...

template<typename Element , typename Layout >
void	TensorFillRandomUniform (TensorView< Element, Layout > view, uint64_t seed, Element max=Element(1), Element min=Element(0), int bits=-1)
	Fills a tensor with random values with a uniform random distribution. More...

template<typename Element >
void	BlockFillRandomUniform (Element *ptr, size_t capacity, uint64_t seed, Element max=Element(1), Element min=Element(0), int bits=-1)
	Fills a block of data with random values with a uniform random distribution. More...

template<typename Element , typename Layout >
void	TensorFillDiagonal (TensorView< Element, Layout > view, Element diag=Element(1), Element other=Element(0))
	Fills a tensor with one value (diag) on its diagonal and another value (other) everywhere else. More...

template<typename Element , typename Layout >
void	TensorFill (TensorView< Element, Layout > view, Element val=Element(0))
	Fills a tensor with a uniform value. More...

template<typename Element , typename Layout >
void	TensorFillIdentity (TensorView< Element, Layout > view)
	Fills a tensor's diagonal with 1 and 0 everywhere else. More...

template<typename Element , typename Layout >
void	TensorUpdateDiagonal (TensorView< Element, Layout > view, Element diag=Element(1))
	Writes a uniform value to the diagonal of a tensor without modifying off-diagonal elements. More...

template<typename Element , typename Layout >
void	TensorUpdateOffDiagonal (TensorView< Element, Layout > view, Element other=Element(1))
	Writes a uniform value to all elements in the tensor without modifying diagonal elements. More...

template<typename Element , typename Layout >
void	TensorFillLinear (TensorView< Element, Layout > view, Array< Element, Layout::kRank > const &v, Element s=Element(0))
	Fills tensor with a linear combination of its coordinate and another vector. More...

template<typename Element >
void	BlockFillSequential (Element *ptr, int64_t capacity, Element v=Element(1), Element s=Element(0))
	Fills a block of data with sequential elements. More...

template<typename Element >
void	BlockFillRandom (Element *ptr, size_t capacity, uint64_t seed, Distribution dist)
	Fills a block of data with random elements drawn from the given distribution. More...

template<typename Element , typename Layout >
void	TensorCopyDiagonalIn (TensorView< Element, Layout > view, Element const *ptr)
	Copies a diagonal in from host memory without modifying off-diagonal elements. More...

template<typename Element , typename Layout >
void	TensorCopyDiagonalOut (Element *ptr, TensorView< Element, Layout > view)
	Copies the diagonal of a tensor into a dense buffer in host memory. More...

Function Documentation

template<typename TensorRefCollectionA , typename TensorRefCollectionB , typename TensorRefCollectionC , typename ScalarType , typename AccumulatorType , typename InnerProductOp , typename ConvertOp >

void cutlass::reference::device::BatchedGemm	(	gemm::GemmCoord	problem_size,
		int	batch_count,
		ScalarType	alpha,
		TensorRefCollectionA const &	tensor_a,
		TensorRefCollectionB const &	tensor_b,
		ScalarType	beta,
		TensorRefCollectionC &	tensor_c,
		AccumulatorType	initial_accum
	)

template<typename TensorRefCollectionA , typename TensorRefCollectionB , typename TensorRefCollectionC , typename ScalarType , typename AccumulatorType >

void cutlass::reference::device::BatchedGemm	(	gemm::GemmCoord	problem_size,
		int	batch_count,
		ScalarType	alpha,
		TensorRefCollectionA const &	tensor_a,
		TensorRefCollectionB const &	tensor_b,
		ScalarType	beta,
		TensorRefCollectionC &	tensor_c
	)

Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef objects.

template<typename Element >

bool cutlass::reference::device::BlockCompareEqual	(	Element const *	ptr_A,
		Element const *	ptr_B,
		size_t	capacity,
		int	grid_size = `0`,
		int	block_size = `0`
	)

template<typename Element >

bool cutlass::reference::device::BlockCompareRelativelyEqual	(	Element const *	ptr_A,
		Element const *	ptr_B,
		size_t	capacity,
		Element	epsilon,
		Element	nonzero_floor,
		int	grid_size = `0`,
		int	block_size = `0`
	)

template<typename Element >

void cutlass::reference::device::BlockFillRandom	(	Element *	ptr,
		size_t	capacity,
		uint64_t	seed,
		Distribution	dist
	)

template<typename Element >

void cutlass::reference::device::BlockFillRandomGaussian	(	Element *	ptr,
		size_t	capacity,
		uint64_t	seed,
		Element	mean = `Element(0)`,
		Element	stddev = `Element(1)`,
		int	bits = `-1`
	)

Element: element type.

bits: If non-negative, specifies the number of fractional bits that are not truncated to zero. Permits reducing precision of data.

Parameters

seed	seed for RNG
mean	Gaussian distribution's mean
stddev	Gaussian distribution's standard deviation

template<typename Element >

void cutlass::reference::device::BlockFillRandomUniform	(	Element *	ptr,
		size_t	capacity,
		uint64_t	seed,
		Element	max = `Element(1)`,
		Element	min = `Element(0)`,
		int	bits = `-1`
	)

bits: If non-negative, specifies the number of fractional bits that are not truncated to zero. Permits reducing precision of data.

Parameters

seed	seed for RNG
max	upper bound of distribution
min	lower bound for distribution

template<typename Element >

void cutlass::reference::device::BlockFillSequential	(	Element *	ptr,
		int64_t	capacity,
		Element	v = `Element(1)`,
		Element	s = `Element(0)`
	)

template<typename ElementA , typename LayoutA , typename ElementB , typename LayoutB , typename ElementC , typename LayoutC , typename ScalarType , typename AccumulatorType , typename InnerProductOp = multiply_add<AccumulatorType>, typename ConvertOp = NumericConverter<ElementC, ScalarType>>

void cutlass::reference::device::compute_gemm	(	gemm::GemmCoord	problem_size,
		ScalarType	alpha,
		TensorRef< ElementA, LayoutA >	tensor_a,
		TensorRef< ElementB, LayoutB >	tensor_b,
		ScalarType	beta,
		TensorRef< ElementC, LayoutC >	tensor_c,
		TensorRef< ElementC, LayoutC >	tensor_d,
		AccumulatorType	initial_accum
	)

Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef objects.

Explicitly naming types needed by this template can be cumbersome, particularly for the accumulator type, so a function argument 'initial_accum' is exposed. Passing AccumulatorType(0) as the last function argument can be easier than naming all template arguments explicitly.

template<typename ElementA , typename LayoutA , typename ElementB , typename LayoutB , typename ElementC , typename LayoutC , typename ScalarType , typename AccumulatorType , typename InnerProductOp = multiply_add<AccumulatorType>, typename ConvertOp = NumericConverter<ElementC, ScalarType>>

void cutlass::reference::device::compute_gemm	(	gemm::GemmCoord	problem_size,
		ScalarType	alpha,
		TensorRef< ElementA, LayoutA >	tensor_a,
		TensorRef< ElementB, LayoutB >	tensor_b,
		ScalarType	beta,
		TensorRef< ElementC, LayoutC >	tensor_c,
		AccumulatorType	initial_accum
	)

Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef objects.

This assumes the accumulator type is the same type as the scalars.