// Excerpts from cutlass/gemm/warp/mma_tensor_op_tile_iterator.h: warp-level
// tile iterators that feed Tensor Core (mma.sync) operations on SM75+.
// All code below lives in namespace cutlass::gemm::warp.

////////////////////////////////////////////////////////////////////////////////

// General declaration; defined only in the partial specializations that follow.
template <typename Shape_, Operand Operand_, typename Element_,
          typename Layout_, typename InstructionShape_, int OpDelta_,
          int Threads, int PartitionsK_ = 1>
class MmaTensorOpMultiplicandTileIterator;

////////////////////////////////////////////////////////////////////////////////

// Partial specialization for the congruous shared-memory arrangement,
// addressed in pitch-linear coordinates.
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                                   64>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:
  /// Shape of tile to load (concept: PitchLinearShape)
  using Shape = Shape_;

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                "MmaTensorOpMultiplicandIterator may only be instantiated for "
                "A or B operands to warp-level Mma.");

  using Element = Element_;
  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, 64>;
  using InstructionShape = InstructionShape_;

  /// Delta between *MMA operations (in units of *MMA operations)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Number of partitions along the K dimension
  static int const kPartitionsK = PartitionsK_;

  using TensorRef = cutlass::TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {
    static_assert(
        !(Shape::kContiguous % InstructionShape::kContiguous),
        "Shape of warp-level Mma must be divisible by operator shape.");

    // Number of elements per LDSM operation along each dimension
    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
    static int const kLdsmOpInner = 8;

    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
                  "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");

    static_assert(!(Shape::kStrided % kLdsmOpInner),
                  "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");

    /// Shape of one individual LDSM instruction
    static int const LdsmShapeStrided =
        InstructionShape::kStrided / kLdsmOpInner;
    static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
    using LdsmShape =
        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;

    /// Number and arrangement of LDSM instructions
    using LdsmIterations = layout::PitchLinearShape<
        Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
        1>;

    /// Number of groups for each tile
    static int const kGroupsPerTile =
        Shape::kStrided / InstructionShape::kStrided;
  };

 private:
  static_assert(kOpDelta == 1,
                "Alternative arrangements not supported at present.");

  /// Number of internal pointers needed to reference shared memory
  static int const kPointerCount =
      Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous;

  /// Pointer type used for accesses
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 public:
  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:
  /// Shared memory base pointers - not advanced
  AccessType const *pointer_[kPointerCount];

  /// Stride in units of AccessType, running byte offset, and k-group counter
  Index stride_;
  Index byte_offset_;
  int k_group_idx_;

 public:
  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0),
        k_group_idx_(0) {
    int quad_pair = (lane_id >> 3);
    int lane_in_quad = (lane_id & 3);
    int lane_in_quad_pair = (lane_id & 7);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kPointerCount; ++i) {
      int partition_contiguous_idx = -1;
      int access_contiguous_idx = -1;
      int access_strided_idx = -1;

      if (Policy::LdsmShape::kContiguous == 4) {
        // One ldmatrix.x4 per access
        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ i);
        access_contiguous_idx = (quad_pair ^ lane_in_quad);
        access_strided_idx = lane_in_quad_pair;
      }

      int access_contiguous =
          partition_contiguous_idx * Layout::PartitionShape::kContiguous +
          access_contiguous_idx;

      int access_strided = access_strided_idx;

      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
                    access_contiguous + access_strided * stride_;
    }
  }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    byte_offset_ += offset * sizeof(Element);
    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    int contiguous_offset = tile_offset.contiguous();
    if (Shape::kContiguous ==
        Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
      if (tile_offset.contiguous() % 2) {
        // Swap the halves of the pointer table when crossing an odd tile boundary
        CUTLASS_PRAGMA_UNROLL
        for (int i = 0; i < kPointerCount / 2; ++i) {
          AccessType const *tmp_pointer = pointer_[i];
          pointer_[i] = pointer_[i + kPointerCount / 2];
          pointer_[i + kPointerCount / 2] = tmp_pointer;
        }
      }
      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;
    }

    int offset = (tile_offset.strided() * InstructionShape::kStrided) *
                     stride_ * Layout::kElementsPerAccess +
                 contiguous_offset * Shape::kContiguous;

    add_pointer_offset(offset);
    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator++() {
    add_tile_offset({0, 1});

    if (kPartitionsK > 1) {
      ++k_group_idx_;
      // Jump to next stage
      if (k_group_idx_ == Policy::kGroupsPerTile) {
        k_group_idx_ = 0;
        add_tile_offset(
            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
      }
    }
    return *this;
  }

  /// Advances the iterator along the opposite of the advance dimension
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator--() {
    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;
    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }

  /// Loads a fragment from memory with additional logical byte offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
        reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
        int access_idx = c + s * Policy::LdsmIterations::kContiguous;

        AccessType const *source_ptr =
            pointer_[c % kPointerCount] +
            Layout::TileShape::kContiguous * (c / kPointerCount) +
            Policy::LdsmShape::kStrided * s * stride_;

        char const *source_byte_ptr =
            reinterpret_cast<char const *>(source_ptr) + byte_offset +
            byte_offset_;

        cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
            fetch_ptr[access_idx], source_byte_ptr);
      }
    }
  }

  /// Loads a fragment from memory with additional logical offset (in elements)
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset,
            Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset,
                             Index byte_offset) const {
    Index pointer_offset =
        tile_offset.contiguous() * Shape::kContiguous /
            Layout::kElementsPerAccess +
        tile_offset.strided() * InstructionShape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    // no-op for the congruous arrangement
  }
};
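To make the Policy arithmetic above concrete, here is a compile-time sketch with assumed parameters that are not fixed by the source: 16-bit elements (so Layout::kElementsPerAccess == 128/16 == 8), a 64x32 pitch-linear warp tile, and a 16x8 instruction tile. These values exercise the LdsmShape::kContiguous == 4 branch of the constructor above.

// Worked example of the congruous Policy arithmetic (all shapes assumed).
constexpr int kElementsPerAccess = 128 / 16;           // one 128-bit access of 16-bit elements
constexpr int kLdsmOpOuter = kElementsPerAccess;       // 8
constexpr int kLdsmOpInner = 8;

constexpr int kShapeContiguous = 64, kShapeStrided = 32;  // warp tile (assumed)
constexpr int kInstrContiguous = 16, kInstrStrided = 8;   // instruction tile (assumed)

static_assert(kShapeContiguous % kInstrContiguous == 0, "divisibility assert above");

constexpr int kLdsmShapeStrided    = kInstrStrided / kLdsmOpInner;  // 1
constexpr int kLdsmShapeContiguous = 4 / kLdsmShapeStrided;         // 4 -> ldmatrix .x4
constexpr int kLdsmIterContiguous  =
    kShapeContiguous / kElementsPerAccess / kLdsmShapeContiguous;   // 2 LDSM issues per load
constexpr int kGroupsPerTile       = kShapeStrided / kInstrStrided; // 4 k-groups per tile

static_assert(kLdsmShapeContiguous == 4 && kLdsmIterContiguous == 2 &&
              kGroupsPerTile == 4, "values quoted in the comments");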
////////////////////////////////////////////////////////////////////////////////

// Adapter: column-major congruous tiles, expressed in MatrixCoord space and
// delegating to the pitch-linear iterator above. A-operand only.
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:
  using Shape = Shape_;
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA,
                "MmaTensorOpMultiplicandIterator for ColumnMajor Congruous may "
                "only be instantiated for A operand to warp-level Mma.");

  using Element = Element_;
  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  using InstructionShape = InstructionShape_;
  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;

  using TensorRef = cutlass::TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Underlying tile iterator operating on pitch-linear coordinates
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      layout::PitchLinearShape<InstructionShape::kRow,
                               InstructionShape::kColumn>,
      kOpDelta, kThreads, PartitionsK_>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:
  /// Underlying tile iterator
  Base iterator_;

 public:
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    iterator_.add_pointer_offset(offset);
    return *this;
  }

  /// Column-major (row, column) maps to pitch-linear (contiguous, strided)
  CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  CUTLASS_HOST_DEVICE void load(Fragment &frag) const { iterator_.load(frag); }

  CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    iterator_.load(frag, {tile_offset.contiguous(), tile_offset.strided()}, pointer_offset);
  }

  CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const {
    iterator_.load_with_byte_offset(
        frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
  }

  CUTLASS_DEVICE void set_kgroup_index(int k_group) { iterator_.set_kgroup_index(k_group); }
};
////////////////////////////////////////////////////////////////////////////////

// Adapter: row-major congruous tiles, delegating to the pitch-linear iterator
// with row and column swapped. B-operand only.
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:
  using Shape = Shape_;
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kB,
                "MmaTensorOpMultiplicandIterator for RowMajor Congruous may "
                "only be instantiated for B operand to warp-level Mma.");

  using Element = Element_;
  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  using InstructionShape = InstructionShape_;
  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;

  using TensorRef = cutlass::TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Underlying tile iterator operating on pitch-linear coordinates
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      layout::PitchLinearShape<InstructionShape::kColumn,
                               InstructionShape::kRow>,
      kOpDelta, kThreads, PartitionsK_>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:
  /// Underlying tile iterator
  Base iterator_;

 public:
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    iterator_.add_pointer_offset(offset);
    return *this;
  }

  /// Row-major (row, column) maps to pitch-linear (strided, contiguous)
  CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  CUTLASS_HOST_DEVICE void load(Fragment &frag) const { iterator_.load(frag); }

  CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    iterator_.load(frag, {tile_offset.strided(), tile_offset.contiguous()}, pointer_offset);
  }

  CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const {
    iterator_.load_with_byte_offset(
        frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
  }

  CUTLASS_DEVICE void set_kgroup_index(int k_group) { iterator_.set_kgroup_index(k_group); }
};
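The two adapters above only translate MatrixCoord into pitch-linear coordinates; all addressing logic lives in the specialization before them. A hypothetical instantiation for the A operand might look like the sketch below. The warp and instruction shapes are assumptions chosen for illustration; in a real kernel these types are normally derived by the warp-level MMA policy rather than written by hand.

// A sketch, not from the source: an A-operand iterator over column-major
// congruous half_t tiles. 64 == 128 / sizeof(half_t) matches the layout
// argument of the partial specialization above.
using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
    cutlass::sizeof_bits<cutlass::half_t>::value, 64>;

using WarpIteratorA = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
    cutlass::MatrixShape<64, 32>,   // warp tile, M x K (assumed)
    cutlass::gemm::Operand::kA,
    cutlass::half_t,
    LayoutA,
    cutlass::MatrixShape<16, 8>,    // instruction tile, M x K (assumed)
    1,                              // kOpDelta
    32,                             // kThreads
    1>;                             // PartitionsK

Typical use inside a kernel: construct with a TensorRef to shared memory and the lane id, call load() to fill a Fragment, then ++iterator to advance one k-group.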
////////////////////////////////////////////////////////////////////////////////

// Partial specialization for the crosswise shared-memory arrangement,
// addressed in pitch-linear coordinates.
template <typename Shape_, Operand Operand_, typename Element_, int Crosswise,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                   Crosswise>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:
  using Shape = Shape_;
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                "MmaTensorOpMultiplicandIterator may only be instantiated for "
                "A or B operands to warp-level Mma.");

  using Element = Element_;

  /// Extent of the crosswise dimension (in units of elements)
  static int const kCrosswise = Crosswise;

  using Layout = cutlass::layout::TensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, kCrosswise>;
  using InstructionShape = InstructionShape_;

  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;
  static int const kPartitionsK = PartitionsK_;

  using TensorRef = cutlass::TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {
    static_assert(
        !(Shape::kContiguous % InstructionShape::kContiguous),
        "Shape of warp-level Mma must be divisible by operator shape.");

    // Number of elements per LDSM operation along each dimension
    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
    static int const kLdsmOpInner = 8;

    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
                  "Shape of warp-level mma must be divisible by LDSM's "
                  "fundamental tile size.");

    static_assert(!(Shape::kStrided % kLdsmOpInner),
                  "Shape of warp-level mma must be divisible by LDSM's "
                  "fundamental tile size.");

    /// Shape of one individual LDSM instruction
    static int const LdsmShapeContiguous =
        InstructionShape::kContiguous / kLdsmOpOuter;
    static int const LdsmShapeStrided =
        ((4 / LdsmShapeContiguous * kLdsmOpInner) > Shape::kStrided)
            ? (Shape::kStrided / kLdsmOpInner)
            : (4 / LdsmShapeContiguous);
    using LdsmShape =
        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;

    /// Number and arrangement of LDSM instructions
    using LdsmIterations =
        layout::PitchLinearShape<1, Shape::kStrided / kLdsmOpInner /
                                        LdsmShape::kStrided>;

    /// Number of groups for each tile
    static int const kGroupsPerTile = Layout::TileShape::kContiguous /
                                      Layout::kFactor / LdsmShape::kContiguous;
  };

 private:
  static_assert(kOpDelta == 1,
                "Alternative arrangements not supported at present.");

  /// Pointer type used for accesses
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 public:
  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:
  /// Shared memory base pointer - not advanced
  AccessType const *pointer_;

  /// Number of crosswise sections, stride in units of AccessType, running
  /// byte offset (advanced by XOR swizzles), and k-group counter
  Index sections_;
  Index stride_;
  Index byte_offset_;
  int k_group_idx_;

 public:
  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
        sections_(ref.stride(0) / kCrosswise),
        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
        k_group_idx_(0) {
    // On SM75, restrict lane_id to the lanes that issue distinct LDSM addresses
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 750))
    lane_id = lane_id % (Policy::LdsmShape::kCount * Policy::kLdsmOpInner);
#endif

    int lane_in_pair = (lane_id & 1);
    int lane_in_quad = (lane_id & 3);
    int lane_in_quad_pair = (lane_id & 7);
    int lane_in_quad_quad = (lane_id & 15);

    int partition_contiguous_idx = -1;
    int access_contiguous_idx = -1;
    int access_strided_idx = -1;

    if (Layout::kFactor == 4) {
      int factor_in_partition =
          (Layout::PartitionShape::kContiguous * Layout::kFactor /
           Layout::TileShape::kContiguous);

      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
        partition_contiguous_idx = lane_in_quad / factor_in_partition;
        access_contiguous_idx = ((lane_in_pair * factor_in_partition) ^
                                 (lane_in_quad_quad / Layout::kFactor));
        access_strided_idx = lane_id / Layout::kFactor;
      }
      // ... (other LdsmShape arrangements elided)
    } else if (Layout::kFactor == 2) {
      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
        partition_contiguous_idx = (lane_id % Layout::kFactor);
        access_contiguous_idx = (lane_in_quad_pair / Layout::kFactor);
        access_strided_idx = lane_id / Layout::kFactor;
      }
      // ... (other LdsmShape arrangements elided)
    } else if (Layout::kFactor == 1) {
      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
        partition_contiguous_idx = (lane_in_quad_pair >> 2);
        access_contiguous_idx = lane_in_quad;
        access_strided_idx = lane_id;
      }
      // ... (other LdsmShape arrangements elided)
    }

    int access_contiguous =
        partition_contiguous_idx * Layout::PartitionShape::kContiguous +
        access_contiguous_idx;

    int access_strided = access_strided_idx;

    byte_offset_ = (access_contiguous + access_strided * stride_) *
                   sizeof_bits<Element>::value * Layout::kElementsPerAccess / 8;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile;
    int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile;

    byte_offset_ ^= k_groups_delta * Policy::LdsmShape::kContiguous *
                    sizeof_bits<Element>::value *
                    Layout::kElementsPerAccess / 8;
    pointer_ +=
        tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor +
        whole_tiles * stride_ / sections_;
    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator++() {
    // Advancing one k-group toggles bits of byte_offset_ instead of
    // recomputing addresses; the XOR masks walk the swizzled arrangement.
    if ((Policy::kGroupsPerTile / kPartitionsK) > 1) {
      int mask = ((Policy::kGroupsPerTile / kPartitionsK) == 8)
                     ? 3
                     : (((Policy::kGroupsPerTile / kPartitionsK) == 4) ? 1 : 0);

      if (((k_group_idx_ & mask) % 2) == 0)
        byte_offset_ ^= 1 * Policy::LdsmShape::kContiguous *
                        sizeof_bits<Element>::value *
                        Layout::kElementsPerAccess / 8;
      else if ((k_group_idx_ & mask) == 1)
        byte_offset_ ^= 3 * Policy::LdsmShape::kContiguous *
                        sizeof_bits<Element>::value *
                        Layout::kElementsPerAccess / 8;
      else if ((k_group_idx_ & mask) == 3)
        byte_offset_ ^= 7 * Policy::LdsmShape::kContiguous *
                        sizeof_bits<Element>::value *
                        Layout::kElementsPerAccess / 8;
    }

    k_group_idx_++;

    if (k_group_idx_ == (Policy::kGroupsPerTile / kPartitionsK)) {
      k_group_idx_ = 0;
      add_tile_offset({Policy::kGroupsPerTile, 0});
    }
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  /// Loads a fragment from memory with additional logical byte offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
        reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
        int access_idx = c + s * Policy::LdsmIterations::kContiguous;

        AccessType const *source_ptr =
            pointer_ + Policy::LdsmShape::kContiguous * c +
            Policy::kLdsmOpInner / Layout::kFactor *
                Policy::LdsmShape::kStrided * s * stride_;

        char const *source_byte_ptr =
            reinterpret_cast<char const *>(source_ptr) + byte_offset +
            byte_offset_;

        cutlass::arch::ldsm<layout::RowMajor, Policy::LdsmShape::kCount>(
            fetch_ptr[access_idx], source_byte_ptr);
      }
    }
  }

  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset,
            Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset,
                             Index byte_offset) const {
    Index pointer_offset = tile_offset.contiguous() *
                               InstructionShape::kContiguous /
                               Layout::kElementsPerAccess +
                           tile_offset.strided() * Shape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    k_group_idx_ = k_group % (Policy::kGroupsPerTile / kPartitionsK);
  }
};
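operator++ above never recomputes a full address: it toggles bits of byte_offset_ so that successive k-groups land on their swizzled shared-memory locations. The host-side sketch below replays that logic under stated assumptions (eight k-groups per tile, a single K partition, and a 32-byte XOR unit standing in for LdsmShape::kContiguous * sizeof_bits<Element>::value * kElementsPerAccess / 8) and shows the walk visits each of the eight group offsets exactly once before wrapping.

#include <cstdio>

int main() {
  int const kGroupsPerTile = 8;   // assumed
  int const kPartitionsK   = 1;   // assumed
  int const kUnit          = 32;  // assumed XOR unit in bytes

  // Same mask selection as operator++ above.
  int mask = ((kGroupsPerTile / kPartitionsK) == 8)
                 ? 3
                 : (((kGroupsPerTile / kPartitionsK) == 4) ? 1 : 0);

  int byte_offset = 0;
  for (int k = 0; k < kGroupsPerTile; ++k) {
    printf("k_group %d -> byte_offset %3d\n", k, byte_offset);
    if (((k & mask) % 2) == 0)  byte_offset ^= 1 * kUnit;
    else if ((k & mask) == 1)   byte_offset ^= 3 * kUnit;
    else if ((k & mask) == 3)   byte_offset ^= 7 * kUnit;
  }
  // Prints offsets 0, 32, 64, ..., 224: all eight k-groups, each exactly once,
  // and byte_offset returns to 0 afterward.
  return 0;
}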
////////////////////////////////////////////////////////////////////////////////

// Adapter: column-major crosswise tiles, delegating to the pitch-linear
// crosswise iterator above. B-operand only.
template <typename Shape_, Operand Operand_, typename Element_, int Crosswise,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:
  using Shape = Shape_;
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kB,
                "MmaTensorOpMultiplicandIterator for ColumnMajor Crosswise may "
                "only be instantiated for B operand to warp-level Mma.");

  using Element = Element_;
  static int const kCrosswise = Crosswise;
  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, kCrosswise>;
  using InstructionShape = InstructionShape_;
  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;

  using TensorRef = cutlass::TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Underlying tile iterator operating on pitch-linear coordinates
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            kCrosswise>,
      layout::PitchLinearShape<InstructionShape::kRow,
                               InstructionShape::kColumn>,
      kOpDelta, kThreads, PartitionsK_>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:
  /// Underlying tile iterator
  Base iterator_;

 public:
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    iterator_.add_pointer_offset(offset);
    return *this;
  }

  CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  CUTLASS_HOST_DEVICE void load(Fragment &frag) const { iterator_.load(frag); }

  CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    iterator_.load(frag, {tile_offset.contiguous(), tile_offset.strided()}, pointer_offset);
  }

  CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const {
    iterator_.load_with_byte_offset(
        frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
  }

  CUTLASS_DEVICE void set_kgroup_index(int k_group) { iterator_.set_kgroup_index(k_group); }
};
////////////////////////////////////////////////////////////////////////////////

// Adapter: row-major crosswise tiles, delegating to the pitch-linear
// crosswise iterator with row and column swapped. A-operand only.
template <typename Shape_, Operand Operand_, typename Element_, int Crosswise,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:
  using Shape = Shape_;
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA,
                "MmaTensorOpMultiplicandIterator for RowMajor Crosswise may "
                "only be instantiated for A operand to warp-level Mma.");

  using Element = Element_;
  static int const kCrosswise = Crosswise;
  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, kCrosswise>;
  using InstructionShape = InstructionShape_;
  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;

  using TensorRef = cutlass::TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Underlying tile iterator operating on pitch-linear coordinates
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            kCrosswise>,
      layout::PitchLinearShape<InstructionShape::kColumn,
                               InstructionShape::kRow>,
      kOpDelta, kThreads, PartitionsK_>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:
  /// Underlying tile iterator
  Base iterator_;

 public:
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    iterator_.add_pointer_offset(offset);
    return *this;
  }

  CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  CUTLASS_HOST_DEVICE void load(Fragment &frag) const { iterator_.load(frag); }

  CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    iterator_.load(frag, {tile_offset.strided(), tile_offset.contiguous()}, pointer_offset);
  }

  CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const {
    iterator_.load_with_byte_offset(
        frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
  }

  CUTLASS_DEVICE void set_kgroup_index(int k_group) { iterator_.set_kgroup_index(k_group); }
};
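Taken together, the four adapters pin down the operand-to-layout pairing through their static_asserts: congruous column-major and crosswise row-major tiles feed the A operand, while congruous row-major and crosswise column-major tiles feed B. A hypothetical B-operand instantiation, again with assumed shapes:

// A sketch, not from the source: a B-operand iterator over column-major
// crosswise half_t tiles. The crosswise extent (32 here, typically the
// threadblock's K-tile depth) is an assumption and must match the
// shared-memory layout used to stage the operand.
using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
    cutlass::sizeof_bits<cutlass::half_t>::value, 32>;

using WarpIteratorB = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
    cutlass::MatrixShape<32, 64>,   // warp tile, K x N (assumed)
    cutlass::gemm::Operand::kB,
    cutlass::half_t,
    LayoutB,
    cutlass::MatrixShape<8, 8>,     // instruction tile, K x N (assumed)
    1, 32, 1>;                      // kOpDelta, kThreads, PartitionsK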
////////////////////////////////////////////////////////////////////////////////

// General declaration of the accumulator tile iterator; defined only in the
// layout-specific partial specializations that follow.
template <typename Shape_, typename Element_, typename Layout_,
          typename InstructionShape_, typename OpDelta_>
class MmaTensorOpAccumulatorTileIterator;

////////////////////////////////////////////////////////////////////////////////

// Partial specialization for row-major accumulator tiles.
template <typename Shape_, typename Element_, typename InstructionShape_,
          typename OpDelta_>
class MmaTensorOpAccumulatorTileIterator<
    Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
 public:
  using Shape = Shape_;
  using Element = Element_;
  using Layout = cutlass::layout::RowMajor;
  using InstructionShape = InstructionShape_;
  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  using OpDelta = OpDelta_;

  static int const kThreads = 32;

  using TensorRef = cutlass::TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using TensorCoord = typename TensorRef::TensorCoord;

  struct Policy {
    static_assert(
        !(Shape::kRow % InstructionShape::kM) &&
            !(Shape::kColumn % InstructionShape::kN),
        "Shape of warp-level Mma must be divisible by operator shape.");

    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
                  "Layouts must be defined for logical MatrixCoord coordinate space.");

    /// Number of mma operations performed
    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
                                      Shape::kColumn / InstructionShape::kN>;
  };

 private:
  // The accumulator tile is an arrangement of 8-by-8 sub-tiles: each quad of
  // lanes owns one row of a sub-tile, and each lane owns 1/4 of that row.
  static int const kElementsPerAccess = InstructionShape::kN / 4;
  static int const kRowsPerTile = 8;
  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;

 public:
  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:
  /// Reference to the output tensor
  TensorRef ref_;

 public:
  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
      : ref_(ref) {
    int quad = (lane_id >> 2);
    int lane_in_quad = (lane_id & 3);

    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);

    ref_.add_coord_offset(lane_offset);
  }

  CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  // ... (add_pointer_offset, add_tile_offset, operator++/-- elided)

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const { load_with_pointer_offset(frag, 0); }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    CUTLASS_PRAGMA_UNROLL
    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
                              (mma_n * Policy::MmaIterations::kRow + mma_m);

        CUTLASS_PRAGMA_UNROLL
        for (int row = 0; row < kAccumulatorRows; ++row) {
          CUTLASS_PRAGMA_UNROLL
          for (int col = 0; col < kElementsPerAccess; ++col) {
            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                          row * kRowsPerTile;
            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;

            frag[mma_accum_start + row * kElementsPerAccess + col] =
                offset_ref.at({accum_m, accum_n});
          }
        }
      }
    }
  }

  /// Loads a fragment from memory with additional byte offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset,
            Index pointer_offset) const {
    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) const { store_with_pointer_offset(frag, 0); }

  /// Stores a fragment to memory with additional pointer offset
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    CUTLASS_PRAGMA_UNROLL
    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
                              (mma_n * Policy::MmaIterations::kRow + mma_m);

        CUTLASS_PRAGMA_UNROLL
        for (int row = 0; row < kAccumulatorRows; ++row) {
          CUTLASS_PRAGMA_UNROLL
          for (int col = 0; col < kElementsPerAccess; ++col) {
            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                          row * kRowsPerTile;
            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
            int idx = mma_accum_start + row * kElementsPerAccess + col;

            offset_ref.at({accum_m, accum_n}) = frag[idx];
          }
        }
      }
    }
  }

  /// Stores a fragment to memory with additional byte offset
  CUTLASS_DEVICE
  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {
    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  /// Stores a fragment to memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset) const {
    store(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset,
             Index pointer_offset) const {
    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }
};
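The quad and lane_in_quad arithmetic above encodes the canonical Tensor Core accumulator mapping: each four-lane quad owns one row of every 8x8 sub-tile, and each lane owns kElementsPerAccess consecutive columns of that row. The host-side sketch below (assuming a 16x8 instruction tile, hence kElementsPerAccess == 2 and kAccumulatorRows == 2) replays the index math from load_with_pointer_offset to print which C elements each lane owns.

#include <cstdio>

int main() {
  int const kM = 16, kN = 8;                       // instruction tile (assumed)
  int const kElementsPerAccess = kN / 4;           // 2
  int const kRowsPerTile = 8;
  int const kAccumulatorRows = kM / kRowsPerTile;  // 2

  for (int lane = 0; lane < 32; ++lane) {
    int quad = lane >> 2, lane_in_quad = lane & 3;
    for (int row = 0; row < kAccumulatorRows; ++row)
      for (int col = 0; col < kElementsPerAccess; ++col)
        printf("lane %2d holds C(%2d, %d) in frag[%d]\n", lane,
               quad + row * kRowsPerTile,                 // accum_m
               lane_in_quad * kElementsPerAccess + col,   // accum_n
               row * kElementsPerAccess + col);           // fragment index
  }
  return 0;
}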
////////////////////////////////////////////////////////////////////////////////

// Partial specialization for column-major accumulator tiles. The structure
// mirrors the row-major specialization above; only the layout type differs,
// and TensorRef::at() resolves the actual addressing.
template <typename Shape_, typename Element_, typename InstructionShape_,
          typename OpDelta_>
class MmaTensorOpAccumulatorTileIterator<
    Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_,
    OpDelta_> {
 public:
  using Shape = Shape_;
  using Element = Element_;
  using Layout = cutlass::layout::ColumnMajor;
  using InstructionShape = InstructionShape_;
  using OpDelta = OpDelta_;

  static int const kThreads = 32;

  using TensorRef = cutlass::TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using TensorCoord = typename TensorRef::TensorCoord;

  struct Policy {
    static_assert(
        !(Shape::kRow % InstructionShape::kM) &&
            !(Shape::kColumn % InstructionShape::kN),
        "Shape of warp-level Mma must be divisible by operator shape.");

    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
                  "Layouts must be defined for logical MatrixCoord coordinate space.");

    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
                                      Shape::kColumn / InstructionShape::kN>;
  };

 private:
  static int const kElementsPerAccess = InstructionShape::kN / 4;
  static int const kRowsPerTile = 8;
  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;

 public:
  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:
  TensorRef ref_;

 public:
  CUTLASS_HOST_DEVICE
  MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
      : ref_(ref) {
    int quad = (lane_id >> 2);
    int lane_in_quad = (lane_id & 3);

    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);

    ref_.add_coord_offset(lane_offset);
  }

  CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  // ... (add_pointer_offset, add_tile_offset, operator++/-- elided)

  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const { load_with_pointer_offset(frag, 0); }

  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    CUTLASS_PRAGMA_UNROLL
    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
                              (mma_n * Policy::MmaIterations::kRow + mma_m);

        CUTLASS_PRAGMA_UNROLL
        for (int row = 0; row < kAccumulatorRows; ++row) {
          CUTLASS_PRAGMA_UNROLL
          for (int col = 0; col < kElementsPerAccess; ++col) {
            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                          row * kRowsPerTile;
            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
            int idx = mma_accum_start + row * kElementsPerAccess + col;

            frag[idx] = offset_ref.at({accum_m, accum_n});
          }
        }
      }
    }
  }

  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset,
            Index pointer_offset) const {
    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) const { store_with_pointer_offset(frag, 0); }

  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    CUTLASS_PRAGMA_UNROLL
    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
                              (mma_n * Policy::MmaIterations::kRow + mma_m);

        CUTLASS_PRAGMA_UNROLL
        for (int row = 0; row < kAccumulatorRows; ++row) {
          CUTLASS_PRAGMA_UNROLL
          for (int col = 0; col < kElementsPerAccess; ++col) {
            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                          row * kRowsPerTile;
            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
            int idx = mma_accum_start + row * kElementsPerAccess + col;

            offset_ref.at({accum_m, accum_n}) = frag[idx];
          }
        }
      }
    }
  }

  CUTLASS_DEVICE
  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {
    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  CUTLASS_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset) const {
    store(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset,
             Index pointer_offset) const {
    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }
};
////////////////////////////////////////////////////////////////////////////////

// Partial specialization for interleaved accumulator tiles (identified here
// as ColumnMajorInterleaved from context). Each thread's pair of adjacent
// accumulator elements moves as a single two-element access.
template <typename Shape_, typename Element_, int InterleavedN,
          typename InstructionShape_, typename OpDelta_>
class MmaTensorOpAccumulatorTileIterator<
    Shape_, Element_, cutlass::layout::ColumnMajorInterleaved<InterleavedN>,
    InstructionShape_, OpDelta_> {
 public:
  using Shape = Shape_;
  using Element = Element_;
  using Layout = cutlass::layout::ColumnMajorInterleaved<InterleavedN>;
  using InstructionShape = InstructionShape_;
  using OpDelta = OpDelta_;

  static int const kThreads = 32;

  using TensorRef = cutlass::TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using TensorCoord = typename TensorRef::TensorCoord;

  struct Policy {
    static_assert(
        !(Shape::kRow % InstructionShape::kM) &&
            !(Shape::kColumn % InstructionShape::kN),
        "Shape of warp-level Mma must be divisible by operator shape.");

    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
                  "Layouts must be defined for logical MatrixCoord coordinate space.");

    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
                                      Shape::kColumn / InstructionShape::kN>;
  };

 private:
  static int const kElementsPerAccess = 2;

 public:
  using AccessType = Array<Element, kElementsPerAccess>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:
  TensorRef ref_;

 public:
  CUTLASS_HOST_DEVICE
  MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
      : ref_(ref) {
    int quad = (lane_id >> 2);
    int lane_in_quad = (lane_id & 3);

    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);

    ref_.add_coord_offset(lane_offset);
  }

  CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  // ... (add_pointer_offset, add_tile_offset, operator++/-- elided)

  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const { load_with_pointer_offset(frag, 0); }

  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
        int accum_m = mma_m * InstructionShape::kM;
        int accum_n = mma_n * InstructionShape::kN;

        int idx = mma_m + mma_n * Policy::MmaIterations::kRow;

        AccessType const *access_ptr = reinterpret_cast<AccessType const *>(
            offset_ref.data() + offset_ref.offset({accum_m, accum_n}));

        frag_ptr[idx] = access_ptr[0];
      }
    }
  }

  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset,
            Index pointer_offset) const {
    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) const { store_with_pointer_offset(frag, 0); }

  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
        int accum_m = mma_m * InstructionShape::kM;
        int accum_n = mma_n * InstructionShape::kN;

        int idx = mma_m + mma_n * Policy::MmaIterations::kRow;

        AccessType *access_ptr = reinterpret_cast<AccessType *>(
            offset_ref.data() + offset_ref.offset({accum_m, accum_n}));

        access_ptr[0] = frag_ptr[idx];
      }
    }
  }

  CUTLASS_DEVICE
  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {
    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  CUTLASS_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset) const {
    store(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset,
             Index pointer_offset) const {
    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }
};
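Finally, a hypothetical end-to-end use of the accumulator iterator (a sketch under assumed types, using the row-major specialization shown earlier rather than the interleaved one): after a warp's MMA loop finishes, each lane writes only the accumulator elements it owns.

// A sketch, not from the source: shapes and element types are assumptions.
using AccumIterator = cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
    cutlass::MatrixShape<64, 64>,        // warp tile (assumed)
    float,
    cutlass::layout::RowMajor,
    cutlass::gemm::GemmShape<16, 8, 8>,  // instruction shape (assumed)
    cutlass::MatrixShape<1, 1>>;         // OpDelta

__device__ void store_accumulators(AccumIterator::TensorRef ref,
                                   AccumIterator::Fragment const &accum,
                                   int lane_id) {
  AccumIterator iter(ref, lane_id);  // offsets ref by this lane's (quad, column) origin
  iter.store(accum);                 // each lane writes only the elements it owns
}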