44 namespace threadblock {
131 bool Transpose =
false 144 template <
typename Shape_,
typename Element_,
int AdvanceRank,
typename ThreadMap_,
bool Transpose_>
148 AdvanceRank == 0 || AdvanceRank == 1,
149 "Specialization for pitch-linear iterator may along advance along the " 150 "contiguous(rank=0) or strided(rank=1) dimension.");
153 using Element = Element_;
155 static int const kAdvanceRank = AdvanceRank;
156 using ThreadMap = ThreadMap_;
170 struct alignas((ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value /
174 Array<Element, ThreadMap::kElementsPerAccess> storage;
176 static int const kElements = ThreadMap::kElementsPerAccess;
181 static bool const transpose = Transpose_;
186 ThreadMap, AccessType>;
189 using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
190 ThreadMap::ThreadAccessShape::kCount>;
193 using Mask =
typename TileAccessIterator::Mask;
202 typename TileAccessIterator::Params params_;
207 Params(Layout
const &layout) : params_(layout) { }
215 using BytePointer =
char *;
231 Params
const ¶ms,
240 : address_iterator_(params.params_, pointer, extent, thread_id,
241 threadblock_offset) {}
246 Params
const ¶ms,
251 : PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id,
256 void add_pointer_offset(
LongIndex pointer_offset) {
257 address_iterator_.add_pointer_offset(pointer_offset);
269 address_iterator_.add_tile_offset({0, 1});
271 address_iterator_.add_tile_offset({1, 0});
291 void clear_mask() { address_iterator_.clear_mask(); }
295 void enable_mask() { address_iterator_.enable_mask(); }
299 void set_mask(
Mask const &mask) { address_iterator_.set_mask(mask); }
303 void get_mask(
Mask &mask) { address_iterator_.get_mask(mask); }
307 void load_with_pointer_offset(
Fragment &frag,
Index pointer_offset) {
309 AccessType *frag_ptr =
reinterpret_cast<AccessType *
>(&frag);
312 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
314 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
316 for (
int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++){
318 int access_idx = ts + c * ThreadMap::ThreadAccessShape::kStrided + \
319 s * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
321 address_iterator_.set_iteration_index(access_idx);
322 if (address_iterator_.valid()) {
324 frag_ptr[access_idx] =
325 *(address_iterator_.get() + pointer_offset);
335 t.transform(frag, frag);
341 void load(
Fragment &frag) { load_with_pointer_offset(frag, 0); }
345 void store_with_pointer_offset(
Fragment const &frag,
Index pointer_offset) {
347 AccessType
const *frag_ptr =
reinterpret_cast<AccessType
const *
>(&frag);
350 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
352 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
354 for (
int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++){
356 int access_idx = ts + c * ThreadMap::ThreadAccessShape::kStrided + \
357 s * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
359 address_iterator_.set_iteration_index(access_idx);
360 if (address_iterator_.valid()) {
361 *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
371 void store(
Fragment const &frag) { store_with_pointer_offset(frag, 0); }
394 "Specialization for pitch-linear iterator may along advance along the " 395 "contiguous(rank=0) or strided(rank=1) dimension.");
398 using Element = Element_;
400 static int const kAdvanceRank = AdvanceRank;
401 using ThreadMap = ThreadMap_;
402 static bool const Transpose = Transpose_;
418 (kAdvanceRank == 0 ? 0 : 1),
423 using AccessType =
typename UnderlyingIterator::AccessType;
426 using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
429 using Mask =
typename UnderlyingIterator::Mask;
438 typename UnderlyingIterator::Params params_;
447 Params(Layout
const &layout): params_(layout::PitchLinear(layout.stride(0))) {
467 Params
const ¶ms,
476 layout::PitchLinearCoord(extent.row(), extent.column()),
478 layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
484 Params
const ¶ms,
488 ): PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id,
make_Coord(0, 0)) { }
492 void add_pointer_offset(
LongIndex pointer_offset) {
493 iterator_.add_pointer_offset(pointer_offset);
522 iterator_.clear_mask();
528 iterator_.enable_mask();
533 void set_mask(
Mask const &mask) {
534 iterator_.set_mask(mask);
539 void get_mask(
Mask &mask) {
540 iterator_.get_mask(mask);
545 void load_with_pointer_offset(
Fragment &frag,
Index pointer_offset) {
546 iterator_.load_with_pointer_offset(frag, pointer_offset);
552 load_with_pointer_offset(frag, 0);
557 void store_with_pointer_offset(
Fragment const &frag,
Index pointer_offset) {
558 iterator_.store_with_pointer_offset(frag, pointer_offset);
564 store_with_pointer_offset(frag, 0);
588 "Specialization for pitch-linear iterator may along advance along the " 589 "contiguous(rank=0) or strided(rank=1) dimension.");
592 using Element = Element_;
594 static int const kAdvanceRank = AdvanceRank;
595 using ThreadMap = ThreadMap_;
596 static bool const Transpose = Transpose_;
612 (kAdvanceRank == 0 ? 1 : 0),
617 using AccessType =
typename UnderlyingIterator::AccessType;
620 using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
623 using Mask =
typename UnderlyingIterator::Mask;
632 typename UnderlyingIterator::Params params_;
641 Params(Layout
const &layout): params_(layout::PitchLinear(layout.stride(0))) {
661 Params
const ¶ms,
670 layout::PitchLinearCoord(extent.column(), extent.row()),
672 layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
678 Params
const ¶ms,
682 ): PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id,
make_Coord(0, 0)) { }
686 void add_pointer_offset(
LongIndex pointer_offset) {
687 iterator_.add_pointer_offset(pointer_offset);
716 iterator_.clear_mask();
722 iterator_.enable_mask();
727 void set_mask(
Mask const &mask) {
728 iterator_.set_mask(mask);
733 void get_mask(
Mask &mask) {
734 iterator_.get_mask(mask);
739 void load_with_pointer_offset(
Fragment &frag,
Index pointer_offset) {
740 iterator_.load_with_pointer_offset(frag, pointer_offset);
746 load_with_pointer_offset(frag, 0);
751 void store_with_pointer_offset(
Fragment const &frag,
Index pointer_offset) {
752 iterator_.store_with_pointer_offset(frag, pointer_offset);
758 store_with_pointer_offset(frag, 0);
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:62
Definition: aligned_buffer.h:35
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
Basic copy routines for tensor views.
Mapping function for pitch-linear memory.
Definition: pitch_linear.h:163
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate.
Definition: coord.h:387
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:154
Mapping function for column-major matrices.
Definition: layout/matrix.h:142
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:59
CUTLASS_HOST_DEVICE half_t & operator++(half_t &lhs)
Definition: half.h:694
int64_t LongIndex
Long index type used for offsets.
Definition: pitch_linear.h:175
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
int32_t Index
Index type used for coordinates.
Definition: pitch_linear.h:172
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:151
Templates calculating the address and predicates to the load of tiles from pitch-linear rank=2 tensor...
Definition: matrix_coord.h:39