////////////////////////////////////////////////////////////////////////////////

/// Primary template, defined only through the partial specializations below.
template <
  typename Shape_, Operand Operand_, typename Element_, typename Layout_,
  typename InstructionShape_, int OpDelta_, int Threads>
class MmaVoltaTensorOpMultiplicandTileIterator;

/// Partial specialization for Operand::kA held in the congruous Volta
/// multiplicand shared-memory layout.
template <typename Shape_, typename Element_, typename InstructionShape_, int OpDelta_>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand::kA, Element_,
    cutlass::layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
    InstructionShape_, OpDelta_, 32> {
 public:

  // ...

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {
    static_assert(!(Shape::kContiguous % InstructionShape::kContiguous),
        "Shape of warp-level Mma must be divisible by operator shape.");

    // ...

    /// Number and arrangement of shared-memory load instructions
    using LdsIterations = layout::PitchLinearShape<
        InstructionShape::kStrided / LdsShape::kStrided,
        Shape::kContiguous / LdsShape::kContiguous>;
  };

  static_assert(kOpDelta == 1, "Alternative arrangements not supported at present.");

  /// Number of internal pointers needed to reference shared memory
  static int const kPointerCount = 2;
  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ):
    stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    // Decompose the lane index into the coordinates of its vector access.
    int vec_row = (lane_id >> 4);        // tid[4]
    int vec_col = ((lane_id & 4) >> 2);  // tid[2]

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kPointerCount; ++i) {

      // ...

      // XOR the strided coordinate into the contiguous index so the four
      // lanes of a quad land in distinct shared-memory banks.
      int access_contiguous_idx = (vec_col << 2) | ((lane_id & 3) ^ vec_row);
      int access_contiguous = access_contiguous_idx;

      int access_strided = vec_row;

      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
        access_contiguous + access_strided * stride_;
    }
  }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    byte_offset_ += offset * sizeof(Element);

    return *this;
  }
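To see what the swizzle above does, the following standalone sketch (illustrative, not CUTLASS code) enumerates the lane-to-access mapping for the first internal pointer; the elided i == 1 iteration repeats the same arithmetic with an adjusted vec_row.

// Standalone host sketch: enumerate the lane -> access mapping computed by
// the i == 0 iteration of the constructor loop above.
#include <cstdio>

int main() {
  for (int lane_id = 0; lane_id < 32; ++lane_id) {
    int vec_row = (lane_id >> 4);        // tid[4]
    int vec_col = ((lane_id & 4) >> 2);  // tid[2]

    int access_contiguous = (vec_col << 2) | ((lane_id & 3) ^ vec_row);
    int access_strided = vec_row;

    printf("lane %2d -> (contiguous %d, strided %d)\n",
           lane_id, access_contiguous, access_strided);
  }
  return 0;
}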
  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    int contiguous_offset = tile_offset.contiguous();
    int strided_offset = tile_offset.strided();

    // When the tile is exactly one LDS shape wide, an odd contiguous tile
    // offset lands on the second internal pointer, so swap the pair.
    if (Shape::kContiguous == Policy::LdsShape::kContiguous) {
      if (contiguous_offset % 2) {
        AccessType const *tmp_pointer = pointer_[0];
        pointer_[0] = pointer_[1];
        pointer_[1] = tmp_pointer;
      }
      contiguous_offset = contiguous_offset / 2;
    }

    int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
                     Layout::kElementsPerAccess +
                 contiguous_offset * Shape::kContiguous;

    add_pointer_offset(offset);

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {

    byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator--() {

    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;

    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }
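To make the operator++ arithmetic concrete, a standalone sketch with assumed parameters (half-precision elements, 64-bit accesses, an example stride; none of these values come from this file):

// Standalone host sketch mirroring the operator++ expression above.
#include <cstdio>

int main() {
  int const sizeof_element = 2;        // assumed: half precision
  int const elements_per_access = 8;   // assumed: Layout::kElementsPerAccess
  int const instruction_k_strided = 4; // assumed: InstructionShape::kStrided
  long long stride = 8;                // assumed: stride_ in units of accesses

  long long advance =
      stride * instruction_k_strided * sizeof_element * elements_per_access;
  printf("one operator++ advances byte_offset_ by %lld bytes\n", advance);
  return 0;
}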
  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_byte_offset(frag, 0);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {

    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {

        int access_idx = c + s * Policy::LdsIterations::kContiguous;

        // Alternate between the two internal pointers from one strided step
        // to the next.
        AccessType const *source_ptr = pointer_[s & 1] +
          Policy::LdsShape::kContiguous * c +
          Policy::LdsShape::kStrided * (s / 2) * stride_;

        char const *source_byte_ptr =
          reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;

        fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const *>(source_byte_ptr));
      }
    }
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {

    Index pointer_offset =
      tile_offset.contiguous() * Shape::kContiguous /
        Layout::kElementsPerAccess +
      tile_offset.strided() * InstructionShape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  // ...
};

////////////////////////////////////////////////////////////////////////////////
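All of the load overloads funnel into the byte-offset form. A hedged sketch of how such an iterator is typically driven from a warp-level mainloop; the function and its names are illustrative, not CUTLASS API:

// Illustrative device-side driver over any iterator exposing the interface above.
template <typename Iterator>
__device__ void drive_warp_iterator(typename Iterator::TensorRef ref,
                                    int lane_id, int k_iterations) {
  Iterator iter(ref, lane_id);       // per-lane pointers established here
  typename Iterator::Fragment frag;

  for (int k = 0; k < k_iterations; ++k) {
    iter.load(frag);                 // delegates to load_with_byte_offset(frag, 0)
    ++iter;                          // advance one InstructionShape::kStrided step
    // ... feed frag to the warp-level mma here ...
  }
}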
/// Partial specialization for Operand::kB held in the congruous Volta
/// multiplicand shared-memory layout.
template <typename Shape_, typename Element_, typename InstructionShape_, int OpDelta_>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand::kB, Element_,
    cutlass::layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
    InstructionShape_, OpDelta_, 32> {
 public:

  // ...

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {
    static_assert(!(Shape::kContiguous % InstructionShape::kContiguous),
        "Shape of warp-level Mma must be divisible by operator shape.");

    // ...

    /// Number and arrangement of shared-memory load instructions
    using LdsIterations = layout::PitchLinearShape<
        Shape::kContiguous / LdsShape::kContiguous,
        InstructionShape::kStrided / LdsShape::kStrided>;
  };

  static_assert(kOpDelta == 1, "Alternative arrangements not supported at present.");

  // ...

  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ):
    stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    // Decompose the lane index, XOR-swizzling the contiguous index.
    int access_strided = (lane_id >> 3) & 0x3;
    int access_contiguous = ((lane_id ^ (lane_id >> 3)) & 0x3);

    pointer_ = reinterpret_cast<AccessType const *>(ref.data()) +
      access_contiguous + access_strided * stride_;
  }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    byte_offset_ += offset * sizeof(Element);

    return *this;
  }
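The kB mapping is simpler than the kA one: a single pointer and a two-bit XOR rotation. A standalone sketch (illustrative, not CUTLASS code) enumerating it:

// Standalone host sketch: enumerate the lane -> access mapping of the kB
// constructor above. The XOR rotates the contiguous index within each group
// of eight lanes so the four strided rows hit distinct banks.
#include <cstdio>

int main() {
  for (int lane_id = 0; lane_id < 32; ++lane_id) {
    int access_strided = (lane_id >> 3) & 0x3;
    int access_contiguous = ((lane_id ^ (lane_id >> 3)) & 0x3);
    printf("lane %2d -> (contiguous %d, strided %d)\n",
           lane_id, access_contiguous, access_strided);
  }
  return 0;
}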
  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    int contiguous_offset = tile_offset.contiguous();
    int strided_offset = tile_offset.strided();

    int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
                     Layout::kElementsPerAccess +
                 contiguous_offset * Shape::kContiguous;

    add_pointer_offset(offset);

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {

    byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;

    return *this;
  }

  /// Advances the iterator along the advance dimension (decrements the offset)
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator--() {

    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;

    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }
  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_byte_offset(frag, 0);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {

    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {

        int access_idx = c + s * Policy::LdsIterations::kContiguous;

        AccessType const *source_ptr = pointer_ +
          Policy::LdsShape::kContiguous / Layout::kElementsPerAccess * c +
          Policy::LdsShape::kStrided * s * stride_;

        char const *source_byte_ptr =
          reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;

        fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const *>(source_byte_ptr));
      }
    }
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {

    Index pointer_offset =
      tile_offset.contiguous() * Shape::kContiguous /
        Layout::kElementsPerAccess +
      tile_offset.strided() * InstructionShape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  // ...
};

////////////////////////////////////////////////////////////////////////////////
/// Partial specialization adapting a column-major matrix view of the congruous
/// layout (Operand::kA) to the pitch-linear iterator above.
template <typename Shape_, typename Element_, typename InstructionShape_, int OpDelta_>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand::kA, Element_,
    cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
    InstructionShape_, OpDelta_, 32> {
 public:

  // ...

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
      // ...
      layout::PitchLinearShape<InstructionShape::kRow,
                               InstructionShape::kColumn>,
      kOpDelta, kThreads>;

  // ...

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ): iterator_({ref.data(), ref.stride()}, lane_id) {}

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    iterator_.add_pointer_offset(offset);

    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});

    return *this;
  }

  // ...

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    iterator_.load(frag);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    // ...
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(
      frag,
      {tile_offset.contiguous(), tile_offset.strided()},
      byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }

  // ...
};

////////////////////////////////////////////////////////////////////////////////
/// Partial specialization adapting a row-major matrix view of the congruous
/// layout (Operand::kB) to the pitch-linear iterator above.
template <typename Shape_, typename Element_, typename InstructionShape_, int OpDelta_>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand::kB, Element_,
    cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
    InstructionShape_, OpDelta_, 32> {
 public:

  // ...

  /// Operand tag
  static Operand const kOperand = Operand::kB;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
      "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
      // ...
      layout::PitchLinearShape<InstructionShape::kColumn,
                               InstructionShape::kRow>,
      kOpDelta, kThreads>;

  // ...

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ): iterator_({ref.data(), ref.stride()}, lane_id) {}

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    iterator_.add_pointer_offset(offset);

    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});

    return *this;
  }

  // ...

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    iterator_.load(frag);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    // ...
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(
      frag,
      {tile_offset.strided(), tile_offset.contiguous()},
      byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }

  // ...
};

////////////////////////////////////////////////////////////////////////////////
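Both adapters only translate coordinates before delegating. A minimal standalone sketch of the two mappings, using a hypothetical stand-in for PitchLinearCoord:

// Standalone sketch: the coordinate translations performed by the two adapters.
#include <cstdio>

struct PitchLinear { int contiguous, strided; };  // stand-in for PitchLinearCoord

// Column-major A adapter: rows are the contiguous dimension.
PitchLinear map_a(int row, int column) { return {row, column}; }

// Row-major B adapter: columns are the contiguous dimension.
PitchLinear map_b(int row, int column) { return {column, row}; }

int main() {
  PitchLinear a = map_a(3, 5), b = map_b(3, 5);
  printf("A: (%d,%d)  B: (%d,%d)\n", a.contiguous, a.strided, b.contiguous, b.strided);
  return 0;
}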
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator over the accumulators of a Volta mma.sync operation.
template <typename Shape_, typename Element_, typename Layout_,
          typename InstructionShape_, typename OpDelta_>
class MmaVoltaTensorOpAccumulatorTileIterator {
 public:

  // ...

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {

    /// Volta Tensor Ops operate on an interleaved 32x32 tile
    using InterleavedTile = MatrixShape<32, 32>;

    static_assert(!(Shape::kRow % InterleavedTile::kRow) && !(Shape::kColumn % InterleavedTile::kColumn),
        "Shape of warp-level Mma must be divisible by operator shape.");

    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
        "Layouts must be defined for logical MatrixCoord coordinate space.");

    /// Number of interleaved tiles to iterate over
    using TileIterations = MatrixShape<
      Shape::kRow / InterleavedTile::kRow,
      Shape::kColumn / InterleavedTile::kColumn
    >;

    /// Number of mma operations performed per interleaved tile
    using MmaIterations =
        MatrixShape<InterleavedTile::kRow / InstructionShape::kM,
                    InterleavedTile::kColumn / InstructionShape::kN>;
  };

 private:

  /// Each mma produces its eight accumulators as two partials of four elements
  static int const kElementsPerPartial = 4;

  /// Matrix arrangement of the elements held per partial
  using EleShapePerPatial = typename platform::conditional<
      platform::is_same<Element, float>::value,
      MatrixShape<2, 2>,
      MatrixShape<1, 4> >::type;

  static int const kElementsPerMma = 8;
  static int const kAccumulatorPatials = 2;
  using QuadShapePerPatialMma = MatrixShape<4, 4>;

 public:

  // ...

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpAccumulatorTileIterator(
    TensorRef const &ref,
    int lane_id
  ): ref_(ref) {

    int quad = (lane_id >> 2);
    int lane_in_quad = (lane_id & 3);
    int accum_m, accum_n;

    if (platform::is_same<Element, float>::value) {
      // (quad[2], quad[0]) + lane_in_quad[0]
      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
      // (quad[1]) + lane_in_quad[1]
      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
                (lane_in_quad & 2);
    } else {
      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + lane_in_quad;
      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
    }

    MatrixCoord lane_offset(accum_m, accum_n);
    ref_.add_coord_offset(lane_offset);
  }
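A standalone sketch (illustrative, not CUTLASS code) enumerating the float-path lane mapping above; each quad of lanes owns a small footprint within the 32x32 interleaved tile:

// Standalone host sketch: enumerate the float-path (accum_m, accum_n) origin
// of each lane, using the formulas from the constructor above.
#include <cstdio>

int main() {
  int const kElementsPerPartial = 4;
  int const kAccumulatorPatials = 2;  // (sic) identifier spelling from the source

  for (int lane_id = 0; lane_id < 32; ++lane_id) {
    int quad = (lane_id >> 2);
    int lane_in_quad = (lane_id & 3);
    int accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
    int accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
                  (lane_in_quad & 2);
    printf("lane %2d -> (m %2d, n %d)\n", lane_id, accum_m, accum_n);
  }
  return 0;
}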
  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpAccumulatorTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpAccumulatorTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_pointer_offset(frag, 0);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {

    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    CUTLASS_PRAGMA_UNROLL
    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
        CUTLASS_PRAGMA_UNROLL
        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
          CUTLASS_PRAGMA_UNROLL
          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

            // Flat fragment index at which this mma's eight accumulators start
            int mma_accum_start =
                (((tile_n * Policy::TileIterations::kRow + tile_m) *
                      Policy::MmaIterations::kColumn + mma_n) *
                     Policy::MmaIterations::kRow + mma_m) *
                kElementsPerMma;

            CUTLASS_PRAGMA_UNROLL
            for (int p = 0; p < kAccumulatorPatials; ++p) {
              CUTLASS_PRAGMA_UNROLL
              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
                CUTLASS_PRAGMA_UNROLL
                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
                  int accum_m = tile_m * Policy::InterleavedTile::kRow +
                                mma_m * QuadShapePerPatialMma::kRow + m * 2;
                  int accum_n = tile_n * Policy::InterleavedTile::kColumn +
                                mma_n * QuadShapePerPatialMma::kColumn +
                                p * Policy::InterleavedTile::kColumn / 2 + n;
                  int idx = mma_accum_start + p * kElementsPerPartial +
                            m * EleShapePerPatial::kColumn + n;
                  frag[idx] = offset_ref.at({accum_m, accum_n});
                }
              }
            }
          }
        }
      }
    }
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load(frag, tile_offset, 0);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }
  /// Stores a fragment to memory
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) const {
    store_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory with additional pointer offset
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {

    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    // Same traversal as load_with_pointer_offset, writing instead of reading.
    CUTLASS_PRAGMA_UNROLL
    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
        CUTLASS_PRAGMA_UNROLL
        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
          CUTLASS_PRAGMA_UNROLL
          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

            int mma_accum_start =
                (((tile_n * Policy::TileIterations::kRow + tile_m) *
                      Policy::MmaIterations::kColumn + mma_n) *
                     Policy::MmaIterations::kRow + mma_m) *
                kElementsPerMma;

            CUTLASS_PRAGMA_UNROLL
            for (int p = 0; p < kAccumulatorPatials; ++p) {
              CUTLASS_PRAGMA_UNROLL
              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
                CUTLASS_PRAGMA_UNROLL
                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
                  int accum_m = tile_m * Policy::InterleavedTile::kRow +
                                mma_m * QuadShapePerPatialMma::kRow + m * 2;
                  int accum_n = tile_n * Policy::InterleavedTile::kColumn +
                                mma_n * QuadShapePerPatialMma::kColumn +
                                p * Policy::InterleavedTile::kColumn / 2 + n;
                  int idx = mma_accum_start + p * kElementsPerPartial +
                            m * EleShapePerPatial::kColumn + n;
                  offset_ref.at({accum_m, accum_n}) = frag[idx];
                }
              }
            }
          }
        }
      }
    }
  }

  /// Stores a fragment to memory with additional byte offset
  CUTLASS_HOST_DEVICE
  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {
    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  /// Stores a fragment to memory with logical offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset) const {
    store(frag, tile_offset, 0);
  }

  /// Stores a fragment to memory with logical offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }

  // ...
};

////////////////////////////////////////////////////////////////////////////////
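The nested loops linearize (tile, mma, partial, element) into a flat fragment index. A standalone check with small assumed iteration counts (TileIterations 1x1, MmaIterations 2x2, float element shape 2x2; not values from this file) confirms every index is produced exactly once:

// Standalone host sketch mirroring the idx computation above.
#include <cstdio>

int main() {
  int const TileRow = 1, TileCol = 1, MmaRow = 2, MmaCol = 2;  // assumptions
  int const kElementsPerMma = 8, kElementsPerPartial = 4, kPatials = 2;
  int const EleRow = 2, EleCol = 2;                            // float path

  int const total = TileRow * TileCol * MmaRow * MmaCol * kElementsPerMma;
  int seen[64] = {0};

  for (int tile_n = 0; tile_n < TileCol; ++tile_n)
    for (int tile_m = 0; tile_m < TileRow; ++tile_m)
      for (int mma_n = 0; mma_n < MmaCol; ++mma_n)
        for (int mma_m = 0; mma_m < MmaRow; ++mma_m) {
          int mma_accum_start =
              (((tile_n * TileRow + tile_m) * MmaCol + mma_n) * MmaRow + mma_m) *
              kElementsPerMma;
          for (int p = 0; p < kPatials; ++p)
            for (int m = 0; m < EleRow; ++m)
              for (int n = 0; n < EleCol; ++n)
                ++seen[mma_accum_start + p * kElementsPerPartial + m * EleCol + n];
        }

  for (int i = 0; i < total; ++i)
    if (seen[i] != 1) { printf("index %d visited %d times\n", i, seen[i]); return 1; }
  printf("all %d fragment indices visited exactly once\n", total);
  return 0;
}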
/// Partial specialization for multiplicands held in the crosswise (K-strided)
/// Volta shared-memory layout, serving either the A or B operand.
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int KBlock>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value, KBlock>,
    InstructionShape_, OpDelta_, 32> {
 public:

  // ...

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
      "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for "
      "A or B operands to warp-level Mma.");

  /// KBlock size
  static int const kKBlock = KBlock;

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {

    // ...

    /// Elements moved by one shared-memory load
    static int const kElementsPerAccess = 8;

    /// Contiguous elements per crosswise line
    static int const kContiguousElementsPerLine = 4;
  };

  static_assert(kOpDelta == 1, "Alternative arrangements not supported at present.");

  // ...

  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
        stride_(ref.stride(0) * Policy::kElementsPerAccess),
        line_size((ref.stride(0) * Policy::kContiguousElementsPerLine) /
                  Policy::kElementsPerAccess),
        k_group_idx_(0), byte_offset_(0) {

    int quad = (lane_id / 4);
    int lane_in_quad = (lane_id % 4);
    int access_contiguous;

    if (kOperand == Operand::kA) {
      // A operand: fold tid[2] ^ tid[4] into the low bit of the contiguous index
      access_contiguous = ((quad & 0x4) << 1) + ((lane_in_quad) << 1) +
                          ((quad & 0x1) ^ ((quad & 0x4) >> 2));
    } else {
      // B operand: fold tid[3] ^ tid[4] into the low bit of the contiguous index
      access_contiguous = ((quad & 0x4) << 1) + (lane_in_quad << 1) +
                          ((quad & 0x2) >> 1 ^ ((quad & 0x4) >> 2));
    }

    byte_offset_ = access_contiguous *
                   sizeof(Element) * Policy::kElementsPerAccess;
  }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    byte_offset_ += offset * sizeof(Element);

    return *this;
  }
  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    int contiguous_offset = tile_offset.contiguous();
    int strided_offset = tile_offset.strided();

    // Contiguous tiles advance by whole crosswise lines; strided tiles move
    // within a line.
    pointer_ += contiguous_offset *
                    (InstructionShape::kContiguous /
                     Policy::kContiguousElementsPerLine) *
                    line_size +
                strided_offset * Shape::kStrided / 2;

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {

    k_group_idx_ = (k_group_idx_ + 1) % 8;

    if (k_group_idx_ == 4 || k_group_idx_ == 0) {
      // Every fourth k-group, flip to the other half of the access line.
      byte_offset_ ^= 1 * sizeof(Element) * Policy::kElementsPerAccess;
    }

    pointer_ += line_size;

    return *this;
  }
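The k-group bookkeeping toggles the access offset every four steps. A standalone trace (assuming 2-byte elements; kElementsPerAccess == 8 as above) shows the pattern over one eight-group period:

// Standalone host sketch: trace the byte-offset XOR across eight operator++ calls.
#include <cstdio>

int main() {
  int const sizeof_element = 2;       // assumed: half precision
  int const elements_per_access = 8;  // Policy::kElementsPerAccess above
  int k_group_idx = 0;
  int byte_offset = 0;

  for (int step = 0; step < 8; ++step) {
    k_group_idx = (k_group_idx + 1) % 8;
    if (k_group_idx == 4 || k_group_idx == 0) {
      byte_offset ^= 1 * sizeof_element * elements_per_access;  // flip 16-byte half
    }
    printf("step %d: k_group_idx %d, byte_offset %d\n", step, k_group_idx, byte_offset);
  }
  return 0;
}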
  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_byte_offset(frag, 0);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {

    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {

        int access_idx = c + s * Policy::LdsIterations::kContiguous;

        AccessType const *source_ptr = pointer_ +
          Policy::LdsShape::kContiguous * c * line_size +
          Policy::LdsShape::kStrided * s / 2;

        char const *source_byte_ptr =
          reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;

        fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const *>(source_byte_ptr));

        // The crosswise layout keeps the two 64-bit halves of the access
        // swapped for every other pair of k-groups; swap them back here.
        if (k_group_idx_ & 0x2) {
          uint64_t *low = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2;
          uint64_t *high = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2 + 1;
          uint64_t tmp = *low;
          *low = *high;
          *high = tmp;
        }
      }
    }
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {

    Index pointer_offset = tile_offset.contiguous() *
                               InstructionShape::kContiguous /
                               Policy::kElementsPerAccess +
                           tile_offset.strided() * Shape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  /// Sets the k-group index consulted by the loads above
  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    k_group_idx_ = k_group;
  }

  // ...
};

////////////////////////////////////////////////////////////////////////////////
/// Partial specialization adapting a row-major matrix view of the crosswise
/// layout to the pitch-linear iterator above.
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int KBlock>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value, KBlock>,
    InstructionShape_, OpDelta_, 32> {
 public:

  // ...

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
      "MmaTensorOpMultiplicandIterator may only be instantiated for "
      "A or B operands to warp-level Mma.");

  /// KBlock size
  static int const kKBlock = KBlock;

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
      // ...
      layout::PitchLinearShape<InstructionShape::kRow,
                               InstructionShape::kColumn>,
      kOpDelta, kThreads>;

  // ...

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    iterator_.add_pointer_offset(offset);

    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});

    return *this;
  }

  // ...

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    // ...
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(
        frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }

  // ...
};

////////////////////////////////////////////////////////////////////////////////
/// Partial specialization adapting a column-major matrix view of the crosswise
/// layout to the pitch-linear iterator above.
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int KBlock>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value, KBlock>,
    InstructionShape_, OpDelta_, 32> {
 public:

  // ...

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
      "MmaTensorOpMultiplicandIterator may only be instantiated for "
      "A or B operands to warp-level Mma.");

  /// KBlock size
  static int const kKBlock = KBlock;

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
      // ...
      layout::PitchLinearShape<InstructionShape::kColumn,
                               InstructionShape::kRow>,
      kOpDelta, kThreads>;

  // ...

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    iterator_.add_pointer_offset(offset);

    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});

    return *this;
  }

  // ...

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    // ...
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(
        frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }

  // ...
};

////////////////////////////////////////////////////////////////////////////////
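A hedged sketch of how the crosswise wrappers are stepped, including the k-group index that the loads above consult for the half-fragment swap; the function and its names are illustrative, not CUTLASS API:

// Illustrative device-side loop for a crosswise-layout warp iterator; the
// set_kgroup_index call keeps the swap logic in sync with the mainloop step.
template <typename Iterator>
__device__ void drive_crosswise_iterator(typename Iterator::TensorRef ref,
                                         int lane_id, int k_groups) {
  Iterator iter(ref, lane_id);
  typename Iterator::Fragment frag;

  for (int k = 0; k < k_groups; ++k) {
    iter.set_kgroup_index(k % 8);  // period of 8 k-groups, as in operator++ above
    iter.load(frag);
    ++iter;
    // ... consume frag ...
  }
}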