namespace cutlass {
namespace transform {
namespace threadblock {

/// Regular tile iterator specialized for congruous TensorOp layouts over
/// pitch-linear (contiguous x strided) tiles.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator<
    Shape_, Element_,
    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                          int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout =
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size of 128 bits
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value *
                          ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits");

    /// Number of base pointers held by the iterator
    static int const kPointerCount =
        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };

  /// Vectorized access type
  using AccessType = Array<Element, Layout::kElementsPerAccess>;
 private:

  /// Stride in units of AccessType
  Index stride_;

  /// Base pointers; two are kept when more than one strided iteration is
  /// needed, selected by the low bit of the strided iteration index
  AccessType *pointer_[Detail::kPointerCount];

  /// Byte-level offset applied on top of the base pointers
  Index byte_offset_;

  /// Iteration index in the contiguous dimension
  int iteration_contiguous_;

  /// Iteration index in the strided dimension
  int iteration_strided_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : stride_(ref.stride(0) / Layout::kElementsPerAccess),
        byte_offset_(0) {

    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {

      // Offset of this thread within the threadblock tile for pointer i
      // (units of elements)
      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    }

    set_iteration_index(0);
  }

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) {
    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
  }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }
  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    AccessType *access_ptr = pointer_[iteration_strided_ & 1];

    int stride_idx = (iteration_strided_ & ~1);

    int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
                            ThreadMap::kElementsPerAccess;

    char *access_byte_ptr =
        reinterpret_cast<char *>(access_ptr + access_offset);

    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
  }
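  // Worked example (illustrative numbers, not taken from the original header):
  // assuming ThreadMap::Delta = (64, 8), stride_ = 8 AccessType columns, and
  // ThreadMap::kElementsPerAccess = 8, then iteration_strided_ = 3 and
  // iteration_contiguous_ = 1 select pointer_[3 & 1] = pointer_[1] and add
  //   access_offset = (3 & ~1) * 8 * 8 + 1 * 64 / 8 = 136 AccessType units.
  // The low bit of the strided index picks one of the two precomputed base
  // pointers; the remaining bits advance whole strided rows.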
  /// Advances to the next access within the tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iteration_contiguous_;

    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
      return *this;

    // Contiguous sweep complete; advance the strided iteration
    iteration_contiguous_ = 0;
    ++iteration_strided_;

    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
      return *this;
    }

    // Final strided iteration complete; wrap around to the start of the tile
    iteration_strided_ = 0;

    return *this;
  }
  /// Adds an offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
                       coord.strided() * Shape::kStrided * stride_ *
                           Layout::kElementsPerAccess);
  }
};
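// A minimal standalone sketch (an assumed helper, not part of the original
// header) of the element offset produced by add_tile_offset() above for a
// whole-tile coordinate (c, s): contiguous tile steps advance by
// Shape::kContiguous elements, strided tile steps advance by Shape::kStrided
// rows of stride_ accesses, converted to elements via Layout::kElementsPerAccess.
CUTLASS_HOST_DEVICE
long long congruous_tile_offset_sketch(int c, int s, int shape_contiguous,
                                       int shape_strided, int stride_in_accesses,
                                       int elements_per_access) {
  return (long long)(c) * shape_contiguous +
         (long long)(s) * shape_strided * stride_in_accesses *
             elements_per_access;
}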
/// Rank-2 adapter: tile iterator for column-major congruous TensorOp layouts,
/// implemented in terms of the pitch-linear iterator above.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = MatrixCoord;

  /// Underlying pitch-linear iterator performing the address computation
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:

  /// Underlying pitch-linear iterator
  UnderlyingIterator iterator_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds an offset in units of whole tiles; rows map to the contiguous
  /// dimension and columns to the strided dimension
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }
};
/// Rank-2 adapter: tile iterator for row-major congruous TensorOp layouts,
/// implemented in terms of the pitch-linear iterator above.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator<
    Shape_, Element_,
    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                                  int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = MatrixCoord;

  /// Underlying pitch-linear iterator performing the address computation
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:

  /// Underlying pitch-linear iterator
  UnderlyingIterator iterator_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds an offset in units of whole tiles; columns map to the contiguous
  /// dimension and rows to the strided dimension
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }
};
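// Sketch of the only substantive difference between the two rank-2 adapters
// above (illustrative structs and helpers, not part of this header): both
// forward to the pitch-linear iterator, but they map a MatrixCoord tile
// offset onto (contiguous, strided) coordinates differently.
struct MatrixCoordSketch { int row, column; };
struct PitchLinearCoordSketch { int contiguous, strided; };

// Column-major: rows are contiguous, columns are strided.
CUTLASS_HOST_DEVICE
PitchLinearCoordSketch map_column_major_sketch(MatrixCoordSketch c) {
  return {c.row, c.column};
}

// Row-major: columns are contiguous, rows are strided.
CUTLASS_HOST_DEVICE
PitchLinearCoordSketch map_row_major_sketch(MatrixCoordSketch c) {
  return {c.column, c.row};
}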
/// Regular tile iterator specialized for crosswise TensorOp layouts over
/// pitch-linear (contiguous x strided) tiles.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment,
  int Crosswise
>
class RegularTileIterator<
    Shape_, Element_,
    layout::TensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::TensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;
  static int const kCrosswise = Crosswise;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size of 128 bits
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value *
                          ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits");

    /// Number of base pointers held by the iterator
    static int const kPointerCount =
        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };

  /// Vectorized access type
  using AccessType = Array<Element, Layout::kElementsPerAccess>;
 private:

  /// Total number of crosswise sections
  int sections_;

  /// Number of crosswise sections covered by one stage of the tile
  int sections_per_stage_;

  /// Stride in units of AccessType
  Index stride_;

  /// Base pointers; two are kept when more than one strided iteration is
  /// needed, selected by the low bit of the strided iteration index
  AccessType *pointer_[Detail::kPointerCount];

  /// Byte-level offset applied on top of the base pointers
  Index byte_offset_;

  /// Iteration index in the contiguous dimension
  int iteration_contiguous_;

  /// Iteration index in the strided dimension
  int iteration_strided_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : sections_(ref.stride(0) / kCrosswise),
        sections_per_stage_(Shape::kContiguous / kCrosswise),
        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
        byte_offset_(0) {

    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {

      // Offset of this thread within the threadblock tile for pointer i
      // (units of elements)
      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      pointer_[i] = reinterpret_cast<AccessType *>(ref.data()) +
                    ref.offset(thread_offset_in_threadblock_tile) /
                        Layout::kElementsPerAccess;
    }

    set_iteration_index(0);
  }
  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) {
    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
  }
  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    AccessType *access_ptr = pointer_[iteration_strided_ & 1];

    int stride_idx = (iteration_strided_ & ~1);

    int access_offset =
        stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
            ThreadMap::kElementsPerAccess;

    char *access_byte_ptr =
        reinterpret_cast<char *>(access_ptr + access_offset);

    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
  }
  /// Advances to the next access within the tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iteration_contiguous_;

    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
      return *this;

    // Contiguous sweep complete; advance the strided iteration
    iteration_contiguous_ = 0;
    ++iteration_strided_;

    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
      return *this;
    }

    // Final strided iteration complete; wrap around to the start of the tile
    iteration_strided_ = 0;

    return *this;
  }
  /// Adds an offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(coord.contiguous() * sections_per_stage_ * stride_ *
                           ThreadMap::kElementsPerAccess / sections_ +
                       coord.strided() * Shape::kStrided * stride_ *
                           Layout::kElementsPerAccess);
  }
};
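// A minimal standalone sketch (an assumed helper, not part of the original
// header) of the element offset produced by the crosswise add_tile_offset()
// above. Unlike the congruous case, a contiguous tile step covers only
// sections_per_stage_ of the sections_ crosswise sections, so that term is
// scaled by sections_per_stage / sections.
CUTLASS_HOST_DEVICE
long long crosswise_tile_offset_sketch(int c, int s, int sections_per_stage,
                                       int sections, int stride_in_accesses,
                                       int shape_strided,
                                       int threadmap_elements_per_access,
                                       int layout_elements_per_access) {
  return (long long)(c) * sections_per_stage * stride_in_accesses *
             threadmap_elements_per_access / sections +
         (long long)(s) * shape_strided * stride_in_accesses *
             layout_elements_per_access;
}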
/// Rank-2 adapter: tile iterator for column-major crosswise TensorOp layouts,
/// implemented in terms of the pitch-linear crosswise iterator above.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment,
  int Crosswise
>
class RegularTileIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;
  static int const kCrosswise = Crosswise;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = MatrixCoord;

  /// Underlying pitch-linear iterator performing the address computation
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            kCrosswise>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:

  /// Underlying pitch-linear iterator
  UnderlyingIterator iterator_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds an offset in units of whole tiles; rows map to the contiguous
  /// dimension and columns to the strided dimension
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }
};
/// Rank-2 adapter: tile iterator for row-major crosswise TensorOp layouts,
/// implemented in terms of the pitch-linear crosswise iterator above.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment,
  int Crosswise
>
class RegularTileIterator<
    Shape_, Element_,
    layout::RowMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;
  static int const kCrosswise = Crosswise;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = MatrixCoord;

  /// Underlying pitch-linear iterator performing the address computation
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            kCrosswise>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:

  /// Underlying pitch-linear iterator
  UnderlyingIterator iterator_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds an offset in units of whole tiles; columns map to the contiguous
  /// dimension and rows to the strided dimension
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }
};

} // namespace threadblock
} // namespace transform
} // namespace cutlass
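// A hedged sketch of how a threadblock-scoped consumer might drive one of the
// iterators above to store a tile into shared memory. Everything named here is
// an assumption for illustration; only get() and operator++ are taken from the
// listing above, and the real fragment type and access count come from the
// surrounding CUTLASS definitions.
template <typename Iterator, typename AccessType>
CUTLASS_HOST_DEVICE void store_tile_sketch(Iterator &iter,
                                           AccessType const *frag,
                                           int access_count) {
  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < access_count; ++i) {
    *iter.get() = frag[i];  // one 128-bit vectorized access per step
    ++iter;                 // contiguous iterations first, then strided
  }
}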