48 namespace threadblock {
60 class RegularTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Alignment> {
66 static int const kAdvanceRank = AdvanceRank;
68 static int const kAlignment = Alignment;
76 using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
79 "Advance rank may only be along the contiguous or strided dimensions.");
100 Index increment_strided_;
103 Index increment_advance_;
115 pointer_(reinterpret_cast<uint8_t *>(ref.data()) + (ref.offset(
ThreadMap::initial_offset(thread_idx)) *
sizeof_bits<
Element>::value / 8)) {
117 stride_ = ref.
stride()[0];
134 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
139 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
141 int idx = c + s * ThreadMap::Iterations::kContiguous;
142 frag_ptr[idx] = access_ptr[c * ThreadMap::Delta::kContiguous];
145 if (s + 1 < ThreadMap::Iterations::kStrided) {
146 byte_pointer += increment_strided_;
154 load_with_pointer_offset(
156 tile_offset.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
157 tile_offset.strided() * Shape::kStrided * stride_
164 load_with_pointer_offset(frag, 0);
175 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
180 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
182 int idx = c + s * ThreadMap::Iterations::kContiguous;
183 access_ptr[c * ThreadMap::Delta::kContiguous] = frag_ptr[idx];
186 if (s + 1 < ThreadMap::Iterations::kStrided) {
187 byte_pointer += increment_strided_;
195 store_with_pointer_offset(
197 tile_offset.contiguous() * Shape::kContiguous + tile_offset.strided() * Shape::kStrided * stride_
204 store_with_pointer_offset(frag, 0);
210 pointer_ += increment_advance_;
217 pointer_ -= increment_advance_;
224 pointer_ += pointer_offset;
231 (coord.contiguous() * Shape::kContiguous + coord.strided() * Shape::kStrided * stride_) / 8;
232 add_pointer_offset(offset);
253 static int const kAdvanceRank = AdvanceRank;
255 static int const kAlignment = Alignment;
263 using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
269 (kAdvanceRank == 0 ? 1 : 0),
275 "Advance rank may only be along the row or column dimensions.");
291 iterator_({ref.
data(), ref.
stride()}, thread_idx) {
298 iterator_.load_with_pointer_offset(frag, pointer_offset);
304 iterator_.load_with_pointer_offset(frag, {tile_offset.column(), tile_offset.row()});
310 iterator_.load_with_pointer_offset(frag, 0);
316 iterator_.store_with_pointer_offset(frag, pointer_offset);
322 iterator_.store_with_pointer_offset(frag, {tile_offset.column(), tile_offset.row()});
328 iterator_.store_with_pointer_offset(frag, 0);
348 iterator_.add_pointer_offset(pointer_offset);
354 iterator_.add_tile_offset({coord.column(), coord.row()});
375 static int const kAdvanceRank = AdvanceRank;
377 static int const kAlignment = Alignment;
385 using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
391 (kAdvanceRank == 0 ? 0 : 1),
396 "Advance rank may only be along the row or column dimensions.");
412 iterator_({ref.
data(), ref.
stride()}, thread_idx) {
419 iterator_.load_with_pointer_offset(frag, pointer_offset);
425 iterator_.load_with_pointer_offset(frag, {tile_offset.row(), tile_offset.column()});
431 iterator_.load_with_pointer_offset(frag, 0);
437 iterator_.store_with_pointer_offset(frag, pointer_offset);
443 iterator_.store_with_pointer_offset(frag, {tile_offset.row(), tile_offset.column()});
449 iterator_.store_with_pointer_offset(frag, 0);
469 iterator_.add_pointer_offset(pointer_offset);
475 iterator_.add_tile_offset({coord.row(), coord.column()});
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:62
Definition: aligned_buffer.h:35
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
Defines a structure containing strides, bounds, and a pointer to tensor data.
CUTLASS_HOST_DEVICE Element * data() const
Returns the pointer to referenced data.
Definition: tensor_ref.h:254
Mapping function for pitch-linear memory.
Definition: pitch_linear.h:163
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:154
Aligned array type.
Definition: array.h:511
Mapping function for column-major matrices.
Definition: layout/matrix.h:142
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:59
int64_t LongIndex
Long index type used for offsets.
Definition: pitch_linear.h:175
CUTLASS_HOST_DEVICE Stride stride() const
Returns the layout object's stride vector.
Definition: tensor_ref.h:277
Defines the size of an element in bits.
Definition: numeric_types.h:42
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
int32_t Index
Index type used for coordinates.
Definition: pitch_linear.h:172
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
Templates implementing storing of tiles from pitch-linear rank=2 tensors.
Defines layout functions used by TensorRef and derived classes.
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:151
Basic include for CUTLASS.
Definition: matrix_coord.h:39