38 namespace threadblock {
49 template <
typename Shape_,
typename Element_,
int AdvanceRank,
50 typename ThreadMap_,
int Alignment>
53 layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
54 int(128 / sizeof(Element_))>,
55 AdvanceRank, ThreadMap_, Alignment> {
59 "Specialization for pitch-linear iterator may along advance along the " 60 "contiguous(rank=0) or strided(rank=1) dimension.");
67 static int const kAdvanceRank = AdvanceRank;
68 static int const kAlignment = Alignment;
82 static int const kAccessSizeInBits = 128;
86 "This iterator requires a policy whose access size is 128bs");
92 using AccessType = Array<Element, Layout::kElementsPerAccess>;
97 using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
119 : address_iterator_(ref, thread_id) {}
124 address_iterator_.add_pointer_offset(pointer_offset);
130 address_iterator_.add_tile_offset({0, 1});
146 address_iterator_.add_tile_offset(coord);
152 address_iterator_.set_iteration_index(0);
153 AccessType *frag_ptr =
reinterpret_cast<AccessType *
>(&frag);
156 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
158 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
159 int access_idx = c + s * ThreadMap::Iterations::kContiguous;
160 frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
169 load_with_pointer_offset(frag, 0);
175 address_iterator_.set_iteration_index(0);
176 AccessType
const *frag_ptr =
reinterpret_cast<AccessType
const *
>(&frag);
179 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
181 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
182 int access_idx = c + s * ThreadMap::Iterations::kContiguous;
183 *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
192 store_with_pointer_offset(frag, 0);
205 template <
typename Shape_,
typename Element_,
int AdvanceRank,
206 typename ThreadMap_,
int Alignment>
209 layout::ColumnMajorTensorOpMultiplicandCongruous<
210 sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
211 AdvanceRank, ThreadMap_, Alignment> {
215 "Specialization for column-major iterator may along advance along the " 216 "columns(rank=0) or rows(rank=1) dimension.");
222 static int const kAdvanceRank = AdvanceRank;
223 static int const kAlignment = Alignment;
237 int(128 /
sizeof(Element))>,
238 (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
243 using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
257 ): iterator_({ref.
data(), ref.
stride()}, thread_id) {
264 iterator_.add_pointer_offset(pointer_offset);
270 iterator_.add_tile_offset({coord.row(), coord.column()});
292 iterator_.load_with_pointer_offset(frag, pointer_offset);
298 load_with_pointer_offset(frag, 0);
305 Index pointer_offset) {
307 iterator_.store_with_pointer_offset(frag, pointer_offset);
313 store_with_pointer_offset(frag, 0);
326 template <
typename Shape_,
typename Element_,
int AdvanceRank,
327 typename ThreadMap_,
int Alignment>
330 layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
331 int(128 / sizeof(Element_))>,
332 AdvanceRank, ThreadMap_, Alignment> {
336 "Specialization for row-major iterator may along advance along the " 337 "columns(rank=0) or rows(rank=1) dimension.");
340 using Element = Element_;
343 static int const kAdvanceRank = AdvanceRank;
344 static int const kAlignment = Alignment;
357 layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
358 int(128 /
sizeof(Element))>,
359 (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
364 using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
378 ): iterator_({ref.
data(), ref.
stride()}, thread_id) {
385 iterator_.add_pointer_offset(pointer_offset);
391 iterator_.add_tile_offset({coord.column(), coord.row()});
415 iterator_.load_with_pointer_offset(frag, pointer_offset);
421 load_with_pointer_offset(frag, 0);
428 Index pointer_offset) {
430 iterator_.store_with_pointer_offset(frag, pointer_offset);
436 store_with_pointer_offset(frag, 0);
449 template <
typename Shape_,
typename Element_,
int AdvanceRank,
450 typename ThreadMap_,
int Alignment,
int Crosswise>
452 layout::TensorOpMultiplicandCrosswise<
453 sizeof_bits<Element_>::value, Crosswise>,
454 AdvanceRank, ThreadMap_, Alignment> {
457 AdvanceRank == 0 || AdvanceRank == 1,
458 "Specialization for pitch-linear iterator may along advance along the " 459 "contiguous(rank=0) or strided(rank=1) dimension.");
462 using Element = Element_;
467 static int const kAdvanceRank = AdvanceRank;
468 static int const kAlignment = Alignment;
482 static int const kAccessSizeInBits = 128;
486 "This iterator requires a policy whose access size is 128bs");
491 using AccessType = Array<Element, Layout::kElementsPerAccess>;
496 Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
516 : address_iterator_(ref, thread_id) {}
521 address_iterator_.add_pointer_offset(pointer_offset);
527 address_iterator_.add_tile_offset({1, 0});
543 address_iterator_.add_tile_offset(coord);
549 address_iterator_.set_iteration_index(0);
550 AccessType *frag_ptr =
reinterpret_cast<AccessType *
>(&frag);
553 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
555 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
556 int access_idx = c + s * ThreadMap::Iterations::kContiguous;
557 frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
570 address_iterator_.set_iteration_index(0);
571 AccessType
const *frag_ptr =
reinterpret_cast<AccessType
const *
>(&frag);
574 for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
576 for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
577 int access_idx = c + s * ThreadMap::Iterations::kContiguous;
578 *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
598 template <
typename Shape_,
typename Element_,
int AdvanceRank,
599 typename ThreadMap_,
int Alignment,
int Crosswise>
601 layout::ColumnMajorTensorOpMultiplicandCrosswise<
602 sizeof_bits<Element_>::value, Crosswise>,
603 AdvanceRank, ThreadMap_, Alignment> {
606 AdvanceRank == 0 || AdvanceRank == 1,
607 "Specialization for column-major iterator may along advance along the " 608 "columns(rank=0) or rows(rank=1) dimension.");
611 using Element = Element_;
614 static int const kAdvanceRank = AdvanceRank;
615 static int const kAlignment = Alignment;
627 layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
630 (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
634 using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
646 : iterator_({ref.
data(), ref.
stride()}, thread_id) {}
651 iterator_.add_pointer_offset(pointer_offset);
657 iterator_.add_tile_offset({coord.row(), coord.column()});
679 iterator_.load_with_pointer_offset(frag, pointer_offset);
689 iterator_.store_with_pointer_offset(frag, pointer_offset);
706 template <
typename Shape_,
typename Element_,
int AdvanceRank,
707 typename ThreadMap_,
int Alignment,
int Crosswise>
709 layout::RowMajorTensorOpMultiplicandCrosswise<
710 sizeof_bits<Element_>::value, Crosswise>,
711 AdvanceRank, ThreadMap_, Alignment> {
714 AdvanceRank == 0 || AdvanceRank == 1,
715 "Specialization for row-major iterator may along advance along the " 716 "columns(rank=0) or rows(rank=1) dimension.");
719 using Element = Element_;
722 static int const kAdvanceRank = AdvanceRank;
723 static int const kAlignment = Alignment;
735 layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
738 (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
742 using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
754 : iterator_({ref.
data(), ref.
stride()}, thread_id) {}
759 iterator_.add_pointer_offset(pointer_offset);
765 iterator_.add_tile_offset({coord.column(), coord.row()});
787 iterator_.load_with_pointer_offset(frag, pointer_offset);
797 iterator_.store_with_pointer_offset(frag, pointer_offset);
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:434
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:640
Definition: aligned_buffer.h:35
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:431
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:536
Definition: tensor_op_multiplicand_sm75.h:734
CUTLASS_HOST_DEVICE Element * data() const
Returns the pointer to referenced data.
Definition: tensor_ref.h:254
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:539
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:221
Definition: tensor_op_multiplicand_sm75.h:422
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:843
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:742
Definition: tensor_op_multiplicand_sm75.h:835
Definition: tensor_op_multiplicand_sm75.h:213
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:224
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
CUTLASS_HOST_DEVICE half_t & operator++(half_t &lhs)
Definition: half.h:694
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:846
CUTLASS_HOST_DEVICE Stride stride() const
Returns the layout object's stride vector.
Definition: tensor_ref.h:277
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:745
Defines the size of an element in bits.
Definition: numeric_types.h:42
Templates implementing computing the addresses of storing of tiles from pitch-linear rank=2 tensors...
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:643
Templates implementing storing of tiles from pitch-linear rank=2 tensors.
Definition: tensor_op_multiplicand_sm75.h:632
Definition: matrix_coord.h:39
Definition: tensor_op_multiplicand_sm75.h:527