48 namespace threadblock {
64 using Shape =
typename ThreadMap::Shape;
82 static int const kThreads = ThreadMap::kThreads;
87 ThreadMap::Iterations::kColumn *
88 ThreadMap::Iterations::kRow *
89 ThreadMap::Iterations::kGroup *
90 ThreadMap::Iterations::kCluster *
91 ThreadMap::kElementsPerAccess>;
96 ThreadMap::kElementsPerAccess,
106 uint8_t *byte_pointer_;
123 byte_pointer_(reinterpret_cast<uint8_t *>(ref.data())),
124 stride_((ref.stride(0) *
sizeof_bits<Element>::value) / 8) {
126 TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
130 thread_offset.
row() * stride_ +
133 int byte_offset = thread_offset.
row() * stride_ +
155 for (
int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
158 for (
int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
161 for (
int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
163 uint8_t
const *byte_pointer = byte_pointer_ +
164 row * ThreadMap::Delta::kRow * stride_ +
165 group * ThreadMap::Delta::kGroup* stride_ +
166 cluster * ThreadMap::Delta::kCluster * stride_ +
170 (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
175 for (
int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
177 int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:62
CUTLASS_HOST_DEVICE Index const & column() const
Returns the column of the coordinate.
Definition: matrix_coord.h:85
Array< Element, ThreadMap::Iterations::kColumn *ThreadMap::Iterations::kRow *ThreadMap::Iterations::kGroup *ThreadMap::Iterations::kCluster *ThreadMap::kElementsPerAccess > Fragment
Fragment object.
Definition: shared_load_iterator.h:91
Definition: aligned_buffer.h:35
static int const value
Definition: numeric_types.h:43
Defines a structure containing strides, bounds, and a pointer to tensor data.
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset)
Loads a fragment from memory.
Definition: shared_load_iterator.h:150
static int const kThreads
Definition: shared_load_iterator.h:82
Aligned array type.
Definition: array.h:511
CUTLASS_DEVICE SharedLoadIterator(TensorRef ref, int thread_idx)
Constructor.
Definition: shared_load_iterator.h:119
CUTLASS_HOST_DEVICE Index const & row() const
Returns the row of the coordinate.
Definition: matrix_coord.h:77
static int const kMinAlignment
Definition: shared_load_iterator.h:78
typename TensorRef::ConstTensorRef ConstTensorRef
Definition: shared_load_iterator.h:70
TensorRef< typename platform::remove_const< Element >::type const, Layout > ConstTensorRef
TensorRef to constant data.
Definition: tensor_ref.h:179
ThreadMap_ ThreadMap
Definition: shared_load_iterator.h:63
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:59
static int const kAlignment
Definition: shared_load_iterator.h:80
Defines a Shape template for matrix tiles.
Defines the size of an element in bits.
Definition: numeric_types.h:42
AlignedArray< Element, ThreadMap::kElementsPerAccess, kAlignment > AccessType
Memory access size.
Definition: shared_load_iterator.h:97
typename Layout::Index Index
Definition: shared_load_iterator.h:72
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
Top-level include for all CUTLASS numeric types.
Metaprogram for determining the mapping of output elements to threads for epilogue tiles...
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
CUTLASS_DEVICE void load(Fragment &frag)
Loads a fragment.
Definition: shared_load_iterator.h:189
Element_ Element
Definition: shared_load_iterator.h:66
Defines layout functions used by TensorRef and derived classes.
typename ThreadMap::Shape Shape
Definition: shared_load_iterator.h:64
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: shared_load_iterator.h:139
static int const kElementsPerAccess
Definition: shared_load_iterator.h:76
Definition: shared_load_iterator.h:61
typename Layout::LongIndex LongIndex
Definition: shared_load_iterator.h:73
Basic include for CUTLASS.
Definition: matrix_coord.h:39
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &offset)
Definition: shared_load_iterator.h:144