44 namespace threadblock {
131 int AccessSize = ThreadMap::kElementsPerAccess
144 template <
typename Shape_,
typename Element_,
int AdvanceRank,
145 typename ThreadMap_,
int AccessSize>
147 ThreadMap_, AccessSize> {
150 AdvanceRank == 0 || AdvanceRank == 1,
151 "Specialization for pitch-linear iterator may along advance along the " 152 "contiguous(rank=0) or strided(rank=1) dimension.");
155 using Element = Element_;
157 static int const kAdvanceRank = AdvanceRank;
158 using ThreadMap = ThreadMap_;
178 static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
181 using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
182 ThreadMap::kElementsPerAccess>;
185 using Mask =
typename TileAccessIterator::Mask;
194 typename TileAccessIterator::Params params_;
199 Params(Layout
const &layout) : params_(layout) { }
207 using BytePointer =
char *;
223 Params
const ¶ms,
232 : address_iterator_(params.params_, pointer, extent, thread_id,
233 threadblock_offset) {}
238 Params
const ¶ms,
249 address_iterator_.add_pointer_offset(pointer_offset);
261 address_iterator_.add_tile_offset({0, 1});
263 address_iterator_.add_tile_offset({1, 0});
300 AccessType *frag_ptr =
reinterpret_cast<AccessType *
>(&frag);
303 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
305 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
308 for (
int v = 0; v < kAccessesPerVector; ++v) {
310 int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
312 address_iterator_.set_iteration_index(idx);
313 auto ptr = (address_iterator_.get() + pointer_offset);
315 if (address_iterator_.valid()) {
316 frag_ptr[idx] = *ptr;
331 address_iterator_.set_iteration_index(0);
332 AccessType
const *frag_ptr =
reinterpret_cast<AccessType
const *
>(&frag);
335 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
337 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
339 for (
int v = 0; v < kAccessesPerVector; ++v) {
341 int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
343 if (address_iterator_.valid()) {
344 *(address_iterator_.get() + pointer_offset) = frag_ptr[idx];
377 "Specialization for pitch-linear iterator may along advance along the " 378 "contiguous(rank=0) or strided(rank=1) dimension.");
381 using Element = Element_;
383 static int const kAdvanceRank = AdvanceRank;
384 using ThreadMap = ThreadMap_;
400 (kAdvanceRank == 0 ? 0 : 1),
408 using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
411 using Mask =
typename UnderlyingIterator::Mask;
420 typename UnderlyingIterator::Params params_;
429 Params(Layout
const &layout): params_(layout::PitchLinear(layout.stride(0))) {
449 Params
const ¶ms,
458 layout::PitchLinearCoord(extent.row(), extent.column()),
460 layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
466 Params
const ¶ms,
475 iterator_.add_pointer_offset(pointer_offset);
504 iterator_.clear_mask();
510 iterator_.enable_mask();
516 iterator_.set_mask(mask);
522 iterator_.get_mask(mask);
528 iterator_.load_with_pointer_offset(frag, pointer_offset);
534 load_with_pointer_offset(frag, 0);
540 iterator_.store_with_pointer_offset(frag, pointer_offset);
546 store_with_pointer_offset(frag, 0);
570 "Specialization for pitch-linear iterator may along advance along the " 571 "contiguous(rank=0) or strided(rank=1) dimension.");
574 using Element = Element_;
576 static int const kAdvanceRank = AdvanceRank;
577 using ThreadMap = ThreadMap_;
593 (kAdvanceRank == 0 ? 1 : 0),
601 using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
604 using Mask =
typename UnderlyingIterator::Mask;
613 typename UnderlyingIterator::Params params_;
622 Params(Layout
const &layout): params_(layout::PitchLinear(layout.stride(0))) {
642 Params
const ¶ms,
651 layout::PitchLinearCoord(extent.column(), extent.row()),
653 layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
659 Params
const ¶ms,
668 iterator_.add_pointer_offset(pointer_offset);
697 iterator_.clear_mask();
703 iterator_.enable_mask();
709 iterator_.set_mask(mask);
715 iterator_.get_mask(mask);
721 iterator_.load_with_pointer_offset(frag, pointer_offset);
727 load_with_pointer_offset(frag, 0);
733 iterator_.store_with_pointer_offset(frag, pointer_offset);
739 store_with_pointer_offset(frag, 0);
754 template <
typename Shape_,
typename Element_,
int AdvanceRank,
755 typename ThreadMap_,
int AccessSize,
int InterleavedK>
757 layout::ColumnMajorInterleaved<InterleavedK>,
758 AdvanceRank, ThreadMap_, AccessSize> {
761 AdvanceRank == 0 || AdvanceRank == 1,
762 "Specialization for pitch-linear iterator may along advance along the " 763 "contiguous(rank=0) or strided(rank=1) dimension.");
766 using Element = Element_;
767 static int const kInterleavedK = InterleavedK;
769 static int const kAdvanceRank = AdvanceRank;
770 using ThreadMap = ThreadMap_;
784 Shape::kColumn / kInterleavedK>,
791 using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
792 ThreadMap::kElementsPerAccess>;
795 using Mask =
typename UnderlyingIterator::Mask;
803 typename UnderlyingIterator::Params params_;
812 : params_(layout::PitchLinear(layout.stride(0))) {}
829 Params
const ¶ms,
838 : iterator_(params.params_, pointer,
839 layout::PitchLinearCoord(extent.row() * kInterleavedK,
840 extent.column() / kInterleavedK),
842 layout::PitchLinearCoord(
843 threadblock_offset.row() * kInterleavedK,
844 threadblock_offset.column() / kInterleavedK)) {}
849 Params
const ¶ms,
860 iterator_.add_pointer_offset(pointer_offset);
907 iterator_.load_with_pointer_offset(frag, pointer_offset);
917 iterator_.store_with_pointer_offset(frag, pointer_offset);
935 template <
typename Shape_,
typename Element_,
int AdvanceRank,
936 typename ThreadMap_,
int AccessSize,
int InterleavedK>
938 layout::RowMajorInterleaved<InterleavedK>,
939 AdvanceRank, ThreadMap_, AccessSize> {
942 AdvanceRank == 0 || AdvanceRank == 1,
943 "Specialization for pitch-linear iterator may along advance along the " 944 "contiguous(rank=0) or strided(rank=1) dimension.");
947 using Element = Element_;
948 static int const kInterleavedK = InterleavedK;
950 static int const kAdvanceRank = AdvanceRank;
951 using ThreadMap = ThreadMap_;
965 Shape::kRow / kInterleavedK>,
972 using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
973 ThreadMap::kElementsPerAccess>;
976 using Mask =
typename UnderlyingIterator::Mask;
984 typename UnderlyingIterator::Params params_;
993 : params_(layout::PitchLinear(layout.stride(0))) {}
1010 Params
const ¶ms,
1019 : iterator_(params.params_, pointer,
1020 layout::PitchLinearCoord(extent.column() * kInterleavedK,
1021 extent.row() / kInterleavedK),
1023 layout::PitchLinearCoord(
1024 threadblock_offset.column() * kInterleavedK,
1025 threadblock_offset.row() / kInterleavedK)) {}
1030 Params
const ¶ms,
1041 iterator_.add_pointer_offset(pointer_offset);
1088 iterator_.load_with_pointer_offset(frag, pointer_offset);
1098 iterator_.store_with_pointer_offset(frag, pointer_offset);
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:62
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:355
Definition: aligned_buffer.h:35
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
Architecture-specific operators on memory.
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:249
Mapping function for pitch-linear memory.
Definition: pitch_linear.h:163
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:352
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate.
Definition: coord.h:387
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:154
Aligned array type.
Definition: array.h:511
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:246
Mapping function for column-major matrices.
Definition: layout/matrix.h:142
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:59
CUTLASS_HOST_DEVICE half_t & operator++(half_t &lhs)
Definition: half.h:694
int64_t LongIndex
Long index type used for offsets.
Definition: pitch_linear.h:175
Templates calculating the address and predicates to the load of tiles from pitch-linear rank=2 tensor...
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
int32_t Index
Index type used for coordinates.
Definition: pitch_linear.h:172
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
Definition: layout/matrix.h:343
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:151
Definition: matrix_coord.h:39
Definition: layout/matrix.h:237