namespace threadblock {

layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;
  static int const kAccessSizeInBits = 128;

  static_assert(kAccessSizeInBits == sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess,
      "This iterator requires a policy whose access size is 128 bits");

  static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
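  // When the thread map performs more than one strided iteration, two base pointers are
  // kept; the load/store loops below select between them with (s & 1), so even and odd
  // strided iterations use different precomputed base addresses.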
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

  AccessType *pointer_[Detail::kPointerCount];
  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    for (int i = 0; i < Detail::kPointerCount; ++i) {

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    byte_offset_ += pointer_offset * sizeof(Element);

    add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
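    // operator++ advances by one whole tile: Shape::kStrided * stride_ * Layout::kElementsPerAccess
    // elements along the strided rank, or Shape::kContiguous elements along the contiguous rank.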
    add_pointer_offset(
        coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
        coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
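    // In the loop below, pointer_[s & 1] picks the even/odd base pointer while
    // stride_idx = (s & ~1) supplies the stride term shared by both members of a pair
    // of strided iterations.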
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);

        frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
      }
    }

    load_with_pointer_offset(frag, 0);
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);

        *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) = frag_ptr[access_idx];
      }
    }

    store_with_pointer_offset(frag, 0);
layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 0 : 1),

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
  ): iterator_({ref.data(), ref.stride()}, thread_id) {

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.row(), coord.column()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    load_with_pointer_offset(frag, 0);

  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    iterator_.store_with_pointer_offset(frag, pointer_offset);

    store_with_pointer_offset(frag, 0);
layout::RowMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 1 : 0),

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
  ): iterator_({ref.data(), ref.stride()}, thread_id) {

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.column(), coord.row()});
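    // The row-major adapter forwards to the pitch-linear iterator with coordinates
    // transposed: columns map to the contiguous rank and rows to the strided rank.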
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    load_with_pointer_offset(frag, 0);

  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    iterator_.store_with_pointer_offset(frag, pointer_offset);

    store_with_pointer_offset(frag, 0);
layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;
  static int const kAccessSizeInBits = 128;

  static_assert(kAccessSizeInBits == sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess,
      "This iterator requires a policy whose access size is 128 bits");

  static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

  AccessType *pointer_[Detail::kPointerCount];
  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    for (int i = 0; i < Detail::kPointerCount; ++i) {

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));

    byte_offset_ += pointer_offset * sizeof(Element);

    add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
    add_pointer_offset(
        coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
        coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);

        frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
      }
    }

    load_with_pointer_offset(frag, 0);
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);

        *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) = frag_ptr[access_idx];
      }
    }

    store_with_pointer_offset(frag, 0);
layout::ColumnMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 0 : 1),

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
  ): iterator_({ref.data(), ref.stride()}, thread_id) {

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.row(), coord.column()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    load_with_pointer_offset(frag, 0);

  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    iterator_.store_with_pointer_offset(frag, pointer_offset);

    store_with_pointer_offset(frag, 0);
layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 1 : 0),

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
  ): iterator_({ref.data(), ref.stride()}, thread_id) {

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.column(), coord.row()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    load_with_pointer_offset(frag, 0);

  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    iterator_.store_with_pointer_offset(frag, pointer_offset);

    store_with_pointer_offset(frag, 0);
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<Shape_, Element_,
                          layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                                     Shape_::kContiguous>,
                          AdvanceRank, ThreadMap_, Alignment> {
  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Layout = layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                            Shape::kContiguous>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);

  static int const kIterarionsPerAccess =
      ThreadMap::kElementsPerAccess / Layout::kElementsPerAccess;

  static int const kContiguousElementsPerLine = 4;
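  // kIterarionsPerAccess is the number of Layout-sized vector accesses needed to cover one
  // thread-map access; kContiguousElementsPerLine is the number of contiguous elements
  // grouped on a single line of the crosswise arrangement. Both appear in the offset
  // arithmetic of the load/store loops below.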
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  using Fragment =
      Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;

  AccessType *pointer_[Detail::kPointerCount];
      : line_size(ref.stride(0) * Detail::kContiguousElementsPerLine /
                  Layout::kElementsPerAccess),
        byte_offset_(0) {

    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    for (int i = 0; i < Detail::kPointerCount; ++i) {

      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    byte_offset_ += pointer_offset * sizeof(Element);

    add_pointer_offset(Shape::kContiguous * line_size);
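    // operator++ advances the pointer by Shape::kContiguous * line_size elements, i.e. one
    // tile step in this crosswise arrangement.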
    add_pointer_offset((coord.contiguous() * (Shape::kContiguous / Layout::kElementsPerAccess) *
                            line_size +
                        coord.strided() * Shape::kStrided) *
                       Layout::kElementsPerAccess);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      AccessType *access_ptr = pointer_[(s & 1) ^ (s / 2)];

      access_ptr += 16 * (s / 2);
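      // The index (s & 1) ^ (s / 2) alternates between the two base pointers and swaps
      // their order on every other pair of strided iterations, while += 16 * (s / 2)
      // advances the selected pointer by 16 access-sized elements per completed pair.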
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        for (int i = 0; i < Detail::kIterarionsPerAccess; ++i) {

          int access_offset =
              c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
              vec_pointer_offset + i * line_size;

          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
                               Detail::kIterarionsPerAccess + i;

          char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);

          frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
              access_byte_ptr + byte_offset_);
    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      AccessType *access_ptr = pointer_[(s & 1) ^ ((s >> 1) & 1)];

      access_ptr += 16 * (s / 2);
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        for (int i = 0; i < Detail::kIterarionsPerAccess; ++i) {

          int access_offset =
              c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
              vec_pointer_offset + i * line_size;

          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
                               Detail::kIterarionsPerAccess + i;

          char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);

          *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) =
              frag_ptr[access_idx];
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<Shape_, Element_,
                          layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Shape_::kRow>,
                          AdvanceRank, ThreadMap_, Alignment> {
  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.row(), coord.column()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    iterator_.store_with_pointer_offset(frag, pointer_offset);
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<Shape_, Element_,
                          layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Shape_::kColumn>,
                          AdvanceRank, ThreadMap_, Alignment> {
  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  // Underlying pitch-linear iterator (advance-rank argument shown):
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

    iterator_.add_pointer_offset(pointer_offset);

    iterator_.add_tile_offset({coord.column(), coord.row()});
    iterator_.load_with_pointer_offset(frag, pointer_offset);

    iterator_.store_with_pointer_offset(frag, pointer_offset);
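All of the pitch-linear specializations above share the same addressing pattern: a small set of
precomputed base pointers plus an integer access_offset computed from the strided and contiguous
iteration indices. The following standalone host-side sketch reproduces that arithmetic with
made-up stand-in constants for ThreadMap::Iterations, ThreadMap::Delta, and the stride; it is only
an illustration of the indexing scheme, not part of the CUTLASS iterator.

// Minimal standalone sketch (assumed, illustrative constants only).
#include <cstdio>

int main() {
  // Stand-ins for ThreadMap::Iterations, ThreadMap::Delta, kElementsPerAccess, and stride_.
  int const kIterationsStrided    = 4;
  int const kIterationsContiguous = 2;
  int const kDeltaStrided         = 2;
  int const kDeltaContiguous      = 16;
  int const kElementsPerAccess    = 8;
  int const stride                = 8;   // hypothetical stride, in AccessType units

  // Stand-ins for pointer_[0] and pointer_[1] (offsets into one buffer).
  int base[2] = {0, stride};

  for (int s = 0; s < kIterationsStrided; ++s) {
    int stride_idx = (s & ~1);            // stride term shared by an even/odd pair
    for (int c = 0; c < kIterationsContiguous; ++c) {
      int access_offset = stride_idx * kDeltaStrided * stride +
                          c * kDeltaContiguous / kElementsPerAccess;
      // Even/odd strided iterations select different base pointers via (s & 1).
      std::printf("s=%d c=%d -> pointer_[%d] + %d\n",
                  s, c, s & 1, base[s & 1] + access_offset);
    }
  }
  return 0;
}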