CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
|
#include <regular_tile_iterator_tensor_op.h>
Classes | |
struct | Detail |
Internal details made public to facilitate introspection. More... | |
Public Types | |
using | Shape = Shape_ |
using | Element = Element_ |
using | Layout = layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element))> |
using | Index = typename Layout::Index |
using | LongIndex = typename Layout::LongIndex |
using | TensorRef = TensorRef< Element, Layout > |
using | TensorCoord = typename Layout::TensorCoord |
using | ThreadMap = ThreadMap_ |
using | Fragment = Array< Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess > |
Fragment object to be loaded or stored. More... | |
using | TileAccessIterator = RegularTileAccessIterator< Shape, Element, Layout, kAdvanceRank, ThreadMap > |
Underlying iterator to compute the addresses. More... | |
Public Member Functions | |
CUTLASS_HOST_DEVICE | RegularTileIterator (TensorRef ref, int thread_id) |
Construct a TileIterator with zero threadblock offset. More... | |
CUTLASS_HOST_DEVICE void | add_pointer_offset (LongIndex pointer_offset) |
Adds a pointer offset in units of Element. More... | |
CUTLASS_HOST_DEVICE RegularTileIterator & | operator++ () |
Advances to the next tile in memory. More... | |
CUTLASS_HOST_DEVICE RegularTileIterator | operator++ (int) |
Advances to the next tile in memory. More... | |
CUTLASS_DEVICE void | add_tile_offset (TensorCoord const &coord) |
Adds a tile offset. More... | |
CUTLASS_DEVICE void | load_with_pointer_offset (Fragment &frag, Index pointer_offset) |
Loads a fragment from memory. More... | |
CUTLASS_DEVICE void | load (Fragment &frag) |
Loads a fragment from memory. More... | |
CUTLASS_DEVICE void | store_with_pointer_offset (Fragment const &frag, Index pointer_offset) |
Stores a fragment to memory. More... | |
CUTLASS_DEVICE void | store (Fragment const &frag) |
Stores a fragment to memory. More... | |
Static Public Attributes | |
static int const | kAdvanceRank = AdvanceRank |
static int const | kAlignment = Alignment |
Tile iterator specialized for congruous arrangements for TensorOps.
Satisfies: ForwardTileIteratorConcept | ReadableContiguousTileIteratorConcept | WriteableContiguousTileIteratorConcept
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::Element = Element_ |
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess> |
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::Index = typename Layout::Index |
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::Layout = layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value, int(128 / sizeof(Element))> |
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::LongIndex = typename Layout::LongIndex |
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::Shape = Shape_ |
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::TensorCoord = typename Layout::TensorCoord |
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::TensorRef = TensorRef<Element, Layout> |
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::ThreadMap = ThreadMap_ |
using cutlass::transform::threadblock::RegularTileIterator< Shape_, Element_, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, AdvanceRank, ThreadMap_, Alignment >::TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout, kAdvanceRank, ThreadMap> |
|
inline |
ref | Pointer to start of tensor |
thread_id | ID of each participating thread |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
|
static |
|
static |