CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
Public Types | Public Member Functions | Static Public Attributes | List of all members
cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ > Class Template Reference

#include <mma_tensor_op_tile_iterator.h>

Public Types

using Shape = Shape_
 Shape of tile to load (concept: PitchLinearShape) More...
 
using Element = Element_
 Element type. More...
 
using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>
 Layout of source tile. More...
 
using InstructionShape = InstructionShape_
 Shape of one matrix product operation (concept: MatrixShape) More...
 
using TensorRef = TensorRef< Element, Layout >
 TensorRef type for loading element from a tensor. More...
 
using Index = typename TensorRef::Index
 Index type. More...
 
using LongIndex = typename TensorRef::LongIndex
 Long Index type. More...
 
using TensorCoord = typename TensorRef::TensorCoord
 Coordinate for an element in the tensor. More...
 
using Base = MmaTensorOpMultiplicandTileIterator< layout::PitchLinearShape< Shape::kRow, Shape::kColumn >, kOperand, Element, layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, layout::PitchLinearShape< InstructionShape::kRow, InstructionShape::kColumn >, kOpDelta, kThreads, PartitionsK_ >
 Underlying tile iterator implementation. More...
 
using Fragment = Array< Element, Shape::kCount/kThreads >
 Fragment object holding a thread's part of a tile. More...
 

Public Member Functions

CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator ()
 Default ctor constructs null iterator. More...
 
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator (TensorRef const &ref, int lane_id)
 Constructor from TensorRef. More...
 
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIteratoradd_pointer_offset (LongIndex offset)
 Adds a pointer offset to internal pointer(s) to advance through memory. More...
 
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIteratoradd_tile_offset (TensorCoord const &tile_offset)
 Advances an iterator along logical dimensions of matrix in units of whole tiles. More...
 
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIteratoroperator++ ()
 Advances the iterator along the advance dimension. More...
 
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIteratoroperator-- ()
 Advances the iterator along the advance dimension. More...
 
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIteratoroperator+= (TensorCoord const &tile_offset)
 advances in units of whole tiles along the logical coordinate space of the tensor More...
 
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIteratoroperator-= (TensorCoord const &tile_offset)
 
CUTLASS_HOST_DEVICE void load (Fragment &frag) const
 Loads a fragment from memory at the location pointed to by the iterator. More...
 
CUTLASS_DEVICE void load_with_pointer_offset (Fragment &frag, Index pointer_offset) const
 Loads a fragment from memory with additional logical offset. More...
 
CUTLASS_DEVICE void load_with_byte_offset (Fragment &frag, Index byte_offset) const
 Loads a fragment from memory with additional logical offset. More...
 
CUTLASS_DEVICE void load (Fragment &frag, TensorCoord const &tile_offset) const
 Loads a fragment from memory with logical offset in units of whole tiles. More...
 
CUTLASS_DEVICE void load (Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
 Loads a fragment from memory with logical offset in units of whole tiles. More...
 
CUTLASS_DEVICE void load_with_byte_offset (Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
 Loads a fragment from memory with logical offset in units of whole tiles. More...
 
CUTLASS_DEVICE void set_kgroup_index (int k_group)
 

Static Public Attributes

static Operand const kOperand = Operand_
 Operand tag. More...
 
static int const kOpDelta = OpDelta_
 Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) More...
 
static int const kThreads = 32
 Number of participating threads. More...
 

Detailed Description

template<typename Shape_, Operand Operand_, typename Element_, typename InstructionShape_, int OpDelta_, int PartitionsK_>
class cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >

This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared memory and therefore must be initialized with a TensorRef to shared memory.

Satisfies: ReadableRandomAccessContiguousTileIteratorConcept

Member Typedef Documentation

template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Base = MmaTensorOpMultiplicandTileIterator< layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element, layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>, layout::PitchLinearShape<InstructionShape::kRow, InstructionShape::kColumn>, kOpDelta, kThreads, PartitionsK_>
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Element = Element_
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Fragment = Array<Element, Shape::kCount / kThreads>
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Index = typename TensorRef::Index
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::InstructionShape = InstructionShape_
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::LongIndex = typename TensorRef::LongIndex
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Shape = Shape_
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorCoord = typename TensorRef::TensorCoord
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorRef = TensorRef<Element, Layout>

Constructor & Destructor Documentation

template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_HOST_DEVICE cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator ( )
inline
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_HOST_DEVICE cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::MmaTensorOpMultiplicandTileIterator ( TensorRef const &  ref,
int  lane_id 
)
inline

Member Function Documentation

template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator& cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_pointer_offset ( LongIndex  offset)
inline
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator& cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::add_tile_offset ( TensorCoord const &  tile_offset)
inline
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_HOST_DEVICE void cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load ( Fragment frag) const
inline
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_DEVICE void cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load ( Fragment frag,
TensorCoord const &  tile_offset 
) const
inline
Parameters
fragfragment to load from the tensor
tile_offsetloads a tile with a logical offset in units of whole tiles
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_DEVICE void cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load ( Fragment frag,
TensorCoord const &  tile_offset,
Index  pointer_offset 
) const
inline
Parameters
fragfragment to load from the tensor
tile_offsetloads a tile with a logical offset in units of whole tiles
pointer_offsetloads a tile with a logical offset AND a pointer offset
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_DEVICE void cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset ( Fragment frag,
Index  byte_offset 
) const
inline
Parameters
fragfragment to load from the tensor
byte_offsetloads a tile with a linear offset
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_DEVICE void cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_byte_offset ( Fragment frag,
TensorCoord const &  tile_offset,
Index  byte_offset 
) const
inline
Parameters
fragfragment to load from the tensor
tile_offsetloads a tile with a logical offset in units of whole tiles
byte_offsetloads a tile with a logical offset AND a pointer offset
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_DEVICE void cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::load_with_pointer_offset ( Fragment frag,
Index  pointer_offset 
) const
inline
Parameters
fragfragment to load from the tensor
pointer_offsetloads a tile with a linear offset
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator& cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator++ ( )
inline
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator& cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator+= ( TensorCoord const &  tile_offset)
inline
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator& cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator-- ( )
inline

advances in units of whole tiles along the logical coordinate space of the tensor

template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator& cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::operator-= ( TensorCoord const &  tile_offset)
inline
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
CUTLASS_DEVICE void cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::set_kgroup_index ( int  k_group)
inline

Notify the iterator which k-group it is currently pointing to.

This does not advance the iterator. Rather, it overrides its internal tracking with constant-valued k-group index to enable the compiler to fold constants and achieve more efficient code.

This is used by some nontrivial permuted layouts.

Member Data Documentation

template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
int const cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::kOpDelta = OpDelta_
static
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
Operand const cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::kOperand = Operand_
static
template<typename Shape_ , Operand Operand_, typename Element_ , typename InstructionShape_ , int OpDelta_, int PartitionsK_>
int const cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, int(128/sizeof(Element_))>, InstructionShape_, OpDelta_, 32, PartitionsK_ >::kThreads = 32
static

The documentation for this class was generated from the following file: