CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
|
#include <mma_tensor_op_tile_iterator.h>
Classes | |
struct | Policy |
Internal structure of iterator - made public to enable introspection. More... | |
Public Types | |
using | Shape = Shape_ |
Shape of tile to load (concept: MatrixShape) More... | |
using | Element = Element_ |
Element type. More... | |
using | Layout = cutlass::layout::ColumnMajor |
Layout of source tile. More... | |
using | InstructionShape = InstructionShape_ |
Shape of one matrix product operation (concept: MatrixShape) More... | |
using | OpDelta = OpDelta_ |
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) More... | |
using | TensorRef = TensorRef< Element, Layout > |
TensorRef type for loading element from a tensor. More... | |
using | Index = typename TensorRef::Index |
Index type. More... | |
using | LongIndex = typename TensorRef::LongIndex |
Long Index type. More... | |
using | TensorCoord = typename TensorRef::TensorCoord |
Coordinate for an element in the tensor. More... | |
using | Fragment = Array< Element, Shape::kCount/kThreads > |
Fragment object holding a thread's part of a tile. More... | |
Public Member Functions | |
CUTLASS_HOST_DEVICE | MmaTensorOpAccumulatorTileIterator () |
Default ctor constructs null iterator. More... | |
CUTLASS_HOST_DEVICE | MmaTensorOpAccumulatorTileIterator (TensorRef const &ref, int lane_id) |
Constructor from TensorRef. More... | |
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & | add_pointer_offset (LongIndex offset) |
Adds a pointer offset to internal pointer(s) to advance through memory. More... | |
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & | add_tile_offset (TensorCoord const &tile_offset) |
Advances an iterator along logical dimensions of matrix in units of whole tiles. More... | |
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & | operator++ () |
Advances the iterator along the advance dimension. More... | |
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & | operator-- () |
Advances the iterator along the advance dimension. More... | |
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & | operator+= (TensorCoord const &tile_offset) |
Advances in units of whole tiles along the logical coordinate space of the tensor. More... | |
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & | operator-= (TensorCoord const &tile_offset) |
CUTLASS_HOST_DEVICE void | load (Fragment &frag) const |
Loads a fragment from memory at the location pointed to by the iterator. More... | |
CUTLASS_DEVICE void | load_with_pointer_offset (Fragment &frag, Index pointer_offset) const |
Loads a fragment from memory with additional pointer offset. More... | |
CUTLASS_DEVICE void | load_with_byte_offset (Fragment &frag, Index byte_offset) const |
Loads a fragment from memory with additional byte offset. More... | |
CUTLASS_DEVICE void | load (Fragment &frag, TensorCoord const &tile_offset) const |
Loads a fragment from memory with logical offset in units of whole tiles. More... | |
CUTLASS_DEVICE void | load (Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const |
Loads a fragment from memory with logical offset in units of whole tiles and an additional pointer offset. More... | |
CUTLASS_HOST_DEVICE void | store (Fragment const &frag) const |
Stores a fragment to memory. More... | |
CUTLASS_DEVICE void | store_with_pointer_offset (Fragment const &frag, Index pointer_offset) const |
Stores a fragment to memory with additional pointer offset. More... | |
CUTLASS_DEVICE void | store_with_byte_offset (Fragment const &frag, Index byte_offset) const |
Stores a fragment to memory with additional byte offset. More... | |
CUTLASS_DEVICE void | store (Fragment &frag, TensorCoord const &tile_offset) const |
Stores a fragment to memory with logical offset in units of whole tiles. More... | |
CUTLASS_DEVICE void | store (Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const |
Stores a fragment to memory with logical offset in units of whole tiles and an additional pointer offset. More... | |
Static Public Attributes | |
static Operand const | kOperand = Operand::kC |
Operand tag. More... | |
static int const | kThreads = 32 |
Number of participating threads. More... | |
This tile iterator is specialized for 32-thread TensorOps. It is used to load or store accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major accumulator layout.
Satisfies: ReadableRandomAccessContiguousTileIteratorConcept | WriteableRandomAccessContiguousTileIteratorConcept
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::Element = Element_ |
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::Fragment = Array<Element, Shape::kCount / kThreads> |
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::Index = typename TensorRef::Index |
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::InstructionShape = InstructionShape_ |
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::Layout = cutlass::layout::ColumnMajor |
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::LongIndex = typename TensorRef::LongIndex |
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::OpDelta = OpDelta_ |
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::Shape = Shape_ |
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::TensorCoord = typename TensorRef::TensorCoord |
using cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_ >::TensorRef = TensorRef<Element, Layout> |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
Loads a tile with a logical offset in units of whole tiles.
frag | fragment to load from the tensor |
|
inline |
Loads a tile with a logical offset and a pointer offset.
frag | fragment to load from the tensor |
tile_offset | loads a tile with a logical offset in units of whole tiles |
|
inline |
Loads a tile with a linear offset.
frag | fragment to load from the tensor |
|
inline |
Loads a tile with a linear offset.
frag | fragment to load from the tensor |
|
inline |
|
inline |
|
inline |
Advances in units of whole tiles along the logical coordinate space of the tensor.
|
inline |
|
inline |
|
inline |
Stores a tile with a logical offset in units of whole tiles.
frag | fragment to store to the tensor |
|
inline |
frag | fragment to store to the tensor |
tile_offset | stores a tile with a logical offset in units of whole tiles |
pointer_offset | stores a tile with a logical offset AND a pointer offset |
|
inline |
Stores a tile with a linear offset.
frag | fragment to store to the tensor |
|
inline |
Stores a tile with a linear offset.
frag | fragment to store to the tensor |
|
static |
|
static |