CUTLASS: cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize > Struct Template Reference

CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers

#include <output_tile_thread_map.h>

Classes
struct	CompactedThreadMap
	Compacted thread map in which the 4D region is contiguous. More...

struct	Detail

Public Types
using	Shape = Shape_

using	Count = Count_

using	Iterations = OutputTileShape< Detail::RowArrangement::kIterationsColumn, Detail::RowArrangement::kIterationsRow, Detail::kIterationsGroup, Detail::kIterationsCluster, 1 >

using	Delta = OutputTileShape< Detail::RowArrangement::kDeltaColumn, Detail::RowArrangement::kDeltaRow, Detail::kDeltaGroup, Detail::kDeltaCluster, 1 >

Static Public Member Functions
static CUTLASS_HOST_DEVICE MatrixCoord	initial_offset (int thread_idx)
	Initial offset function. More...

Static Public Attributes
static int const	kWarpSize = 32

static int const	kThreads = Threads

static int const	kWarpCount = kThreads / kWarpSize

static int const	kElementsPerAccess = ElementsPerAccess

static int const	kElementSize = ElementSize

Detailed Description

template<typename Shape_, typename Count_, int Threads, int ElementsPerAccess, int ElementSize>
struct cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >

Template metaprogram for partitioning a 4D space across warps to achieve several performance objectives:

- coalesced memory accesses in units of 128-byte lines
- minimal address arithmetic
- minimal predicate calculations

Member Typedef Documentation

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

using cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::Count = Count_

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

using cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::Delta = OutputTileShape< Detail::RowArrangement::kDeltaColumn, Detail::RowArrangement::kDeltaRow, Detail::kDeltaGroup, Detail::kDeltaCluster, 1>

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

using cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::Iterations = OutputTileShape< Detail::RowArrangement::kIterationsColumn, Detail::RowArrangement::kIterationsRow, Detail::kIterationsGroup, Detail::kIterationsCluster, 1>

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

using cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::Shape = Shape_

Member Function Documentation

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

static CUTLASS_HOST_DEVICE MatrixCoord cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::initial_offset ( int thread_idx )

inline, static

Member Data Documentation

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

int const cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::kElementSize = ElementSize

static

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

int const cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::kElementsPerAccess = ElementsPerAccess

static

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

int const cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::kThreads = Threads

static

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

int const cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::kWarpCount = kThreads / kWarpSize

static

template<typename Shape_ , typename Count_ , int Threads, int ElementsPerAccess, int ElementSize>

int const cutlass::epilogue::threadblock::OutputTileOptimalThreadMap< Shape_, Count_, Threads, ElementsPerAccess, ElementSize >::kWarpSize = 32

static

The documentation for this struct was generated from the following file:

output_tile_thread_map.h