CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
mma_tensor_op_tile_iterator_sm70.h
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/cutlass.h"
32 
33 #include "cutlass/array.h"
34 #include "cutlass/numeric_types.h"
35 #include "cutlass/tensor_ref.h"
36 #include "cutlass/matrix_shape.h"
37 
38 #include "cutlass/gemm/gemm.h"
39 
40 #include "cutlass/layout/matrix.h"
43 
45 
47 
48 namespace cutlass {
49 namespace gemm {
50 namespace warp {
51 
53 
54 template <
56  typename Shape_,
60  typename Element_,
62  typename Layout_,
64  typename InstructionShape_,
67  int OpDelta_,
69  int Threads>
70 class MmaVoltaTensorOpMultiplicandTileIterator;
 71 
73 
79 template <
81  typename Shape_,
83  typename Element_,
85  typename InstructionShape_,
88  int OpDelta_>
89 class MmaVoltaTensorOpMultiplicandTileIterator<
 90  Shape_, Operand::kA, Element_,
 91  cutlass::layout::VoltaTensorOpMultiplicandCongruous<
 92  sizeof_bits<Element_>::value>,
93  InstructionShape_, OpDelta_, 32> {
94  public:
95 
97  using Shape = Shape_;
98 
100  static Operand const kOperand = Operand::kA;
101 
103  using Element = Element_;
104 
107 
109  using InstructionShape = InstructionShape_;
110 
112  static int const kOpDelta = OpDelta_;
113 
115  static int const kThreads = 32;
116 
119 
121  using Index = typename TensorRef::Index;
122 
124  using LongIndex = typename TensorRef::LongIndex;
125 
128 
130  struct Policy {
132  !(Shape::kContiguous % InstructionShape::kContiguous),
133  "Shape of warp-level Mma must be divisible by operator shape.");
134 
135  // Shape of one individual LDS.128
136  // TODO: 32 and 4 are hardcoded; the logical shape is 32-by-4
138  32,
139  4
140  >;
141 
142  // LdsShapes are arranged in the strided direction in SMEM
144  InstructionShape::kStrided / LdsShape::kStrided,
145  Shape::kContiguous / LdsShape::kContiguous
146  >;
147  };
148 
149 private:
150 
152  static_assert(kOpDelta == 1,
153  "Alternative arrangements not supported at present.");
154 
156  static int const kPointerCount = 2;
157 
160 
161 public:
162 
163  //
164  // Derived quantities
165  //
166 
168  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;
169 
170 private:
171 
173  Index stride_;
174 
176  AccessType const *pointer_[kPointerCount];
177 
179  Index byte_offset_;
180 
181 public:
182 
185  MmaVoltaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
186 
188  CUTLASS_DEVICE
190  TensorRef const &ref,
191  int lane_id
192  ):
193  stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
194  // swizzle patterns for operandA LDS are
195  // 1. (tid[4] << 3) | (tid[2:0] ^ tid[4])
196  // 2. (tid[4] << 3) | (tid[2:0] ^ tid[4] ^ 0b10010)
197 
198  int vec_row = (lane_id >> 4); // tid[4]
199  int vec_col = ((lane_id & 4) >> 2); // tid[2]
200 
202  for (int i = 0; i < kPointerCount; ++i) {
203 
204  if(i == 1) {
205  vec_row |= 2;
206  }
207  int access_contiguous_idx = (vec_col << 2) | ((lane_id & 3) ^ vec_row);
208  int access_contiguous = access_contiguous_idx;
209 
210  int access_strided = vec_row;
211  pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
212  access_contiguous + access_strided * stride_;
213  }
214 
215  }
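// Worked example (illustrative): for lane_id = 21 (0b10101), vec_row = tid[4] = 1 and
// vec_col = tid[2] = 1. Pointer 0 uses access_contiguous = (1 << 2) | ((21 & 3) ^ 1) = 4
// with access_strided = 1; pointer 1 first sets vec_row to 3, giving
// access_contiguous = 4 | (1 ^ 3) = 6 with access_strided = 3, so each lane owns two
// swizzled LDS.128 starting locations.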
216 
218  CUTLASS_DEVICE
220 
221  byte_offset_ += offset * sizeof(Element);
222 
223  return *this;
224  }
225 
229 
230  int contiguous_offset = tile_offset.contiguous();
231  int strided_offset = tile_offset.strided();
232 
233  // To support 32x32 tile size
234  if (Shape::kContiguous == Policy::LdsShape::kContiguous) {
235  if (contiguous_offset % 2) {
236  AccessType const *tmp_pointer = pointer_[0];
237  pointer_[0] = pointer_[1];
238  pointer_[1] = tmp_pointer;
239  }
240  contiguous_offset = contiguous_offset / 2;
241  }
242 
243  int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
244  Layout::kElementsPerAccess +
245  contiguous_offset * Shape::kContiguous;
246 
247  add_pointer_offset(offset);
248 
249  return *this;
250  }
251 
253  CUTLASS_DEVICE
255  byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
256  Layout::kElementsPerAccess;
257 
258  return *this;
259  }
260 
264  byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
265  Layout::kElementsPerAccess;
266 
267  return *this;
268  }
269 
271  CUTLASS_DEVICE
273  add_tile_offset(tile_offset);
274  return *this;
275  }
276 
278  CUTLASS_DEVICE
280  add_tile_offset(-tile_offset);
281  return *this;
282  }
283 
286  void load(Fragment &frag) const {
287 
288  load_with_byte_offset(frag, 0);
289  }
290 
292  CUTLASS_DEVICE
295  Fragment &frag,
297  Index byte_offset) const {
298 
299  AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);
300 
302  for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
303 
305  for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
306 
307  int access_idx = c + s * Policy::LdsIterations::kContiguous;
308 
309  AccessType const *source_ptr = pointer_[s & 1] +
310  Policy::LdsShape::kContiguous * c +
311  Policy::LdsShape::kStrided * (s / 2) * stride_;
312 
313  char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
314  fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
315  }
316  }
317  }
318 
320  CUTLASS_DEVICE
323  Fragment &frag,
325  Index pointer_offset) const {
326  load_with_byte_offset(frag, pointer_offset * sizeof(Element));
327  }
328 
330  CUTLASS_DEVICE
331  void load(
333  Fragment &frag,
335  TensorCoord const &tile_offset) const {
336  load_with_byte_offset(frag, tile_offset, 0);
337  }
338 
340  CUTLASS_DEVICE
341  void load(
343  Fragment &frag,
345  TensorCoord const &tile_offset,
347  Index pointer_offset) const {
348  load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
349  }
350 
352  CUTLASS_DEVICE
355  Fragment &frag,
357  TensorCoord const &tile_offset,
359  Index byte_offset) const {
360  Index pointer_offset =
361  tile_offset.contiguous() * Shape::kContiguous /
362  Layout::kElementsPerAccess +
363  tile_offset.strided() * InstructionShape::kStrided * stride_;
364 
365  byte_offset += sizeof(AccessType) * pointer_offset;
366 
367  load_with_byte_offset(frag, byte_offset);
368  }
369 
377  CUTLASS_DEVICE
378  void set_kgroup_index(int k_group) {
379  // no operation here
380  }
381 };
382 
384 
390 template <
392  typename Shape_,
394  typename Element_,
396  typename InstructionShape_,
399  int OpDelta_>
400 
401 class MmaVoltaTensorOpMultiplicandTileIterator<
 402  Shape_, Operand::kB, Element_,
 403  cutlass::layout::VoltaTensorOpMultiplicandBCongruous<
 404  sizeof_bits<Element_>::value>,
405  InstructionShape_, OpDelta_, 32> {
406  public:
407 
409  using Shape = Shape_;
410 
412  static Operand const kOperand = Operand::kB;
413 
415  using Element = Element_;
416 
419 
421  using InstructionShape = InstructionShape_;
422 
424  static int const kOpDelta = OpDelta_;
425 
427  static int const kThreads = 32;
428 
431 
433  using Index = typename TensorRef::Index;
434 
436  using LongIndex = typename TensorRef::LongIndex;
437 
440 
442  struct Policy {
444  !(Shape::kContiguous % InstructionShape::kContiguous),
445  "Shape of warp-level Mma must be divisible by operator shape.");
446 
447  // Shape of one individual LDS
448  // TODO: remove hardcoded 32 and 4
450  32,
451  4
452  >;
453 
455  Shape::kContiguous / LdsShape::kContiguous,
456  InstructionShape::kStrided / LdsShape::kStrided
457  >;
458  };
459 
460 private:
461 
463  static_assert(kOpDelta == 1,
464  "Alternative arrangements not supported at present.");
465 
468 
469 public:
470 
471  //
472  // Derived quantities
473  //
474 
476  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;
477 
478 private:
479 
481  Index stride_;
482 
484  AccessType const *pointer_;
485 
487  Index byte_offset_;
488 
489 public:
490 
493  MmaVoltaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
494 
496  CUTLASS_DEVICE
498  TensorRef const &ref,
499  int lane_id
500  ):
501  stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
502 
503  // swizzle pattern: strided index = tid[4:3], contiguous index = tid[1:0] ^ tid[4:3]
504  int access_strided = (lane_id >> 3) & 0x3;
505  int access_contiguous = ((lane_id ^ (lane_id >> 3)) & 0x3);
506 
507  pointer_ = reinterpret_cast<AccessType const *>(ref.data()) +
508  access_contiguous + access_strided * stride_;
509 
510  }
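// Worked example (illustrative): for lane_id = 27 (0b11011), access_strided =
// (27 >> 3) & 0x3 = 3 (tid[4:3]) and access_contiguous = (27 ^ 3) & 0x3 = 0
// (tid[1:0] ^ tid[4:3]), so this lane's pointer begins 3 strides into the tile in
// units of AccessType.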
511 
513  CUTLASS_DEVICE
515 
516  byte_offset_ += offset * sizeof(Element);
517 
518  return *this;
519  }
520 
524 
525  int contiguous_offset = tile_offset.contiguous();
526  int strided_offset = tile_offset.strided();
527 
528  int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
529  Layout::kElementsPerAccess +
530  contiguous_offset * Shape::kContiguous;
531 
532  add_pointer_offset(offset);
533 
534  return *this;
535  }
536 
538  CUTLASS_DEVICE
540  byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
541  Layout::kElementsPerAccess;
542 
543  return *this;
544  }
545 
549  byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
550  Layout::kElementsPerAccess;
551 
552  return *this;
553  }
554 
556  CUTLASS_DEVICE
558  add_tile_offset(tile_offset);
559  return *this;
560  }
561 
563  CUTLASS_DEVICE
565  add_tile_offset(-tile_offset);
566  return *this;
567  }
568 
571  void load(Fragment &frag) const {
572 
573  load_with_byte_offset(frag, 0);
574  }
575 
577  CUTLASS_DEVICE
580  Fragment &frag,
582  Index byte_offset) const {
583 
584  AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);
585 
587  for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
588 
590  for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
591 
592  int access_idx = c + s * Policy::LdsIterations::kContiguous;
593 
594  AccessType const *source_ptr = pointer_ +
595  Policy::LdsShape::kContiguous / Layout::kElementsPerAccess * c +
596  Policy::LdsShape::kStrided * s * stride_;
597 
598  char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
599  fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
600  }
601  }
602  }
603 
605  CUTLASS_DEVICE
608  Fragment &frag,
610  Index pointer_offset) const {
611  load_with_byte_offset(frag, pointer_offset * sizeof(Element));
612  }
613 
615  CUTLASS_DEVICE
616  void load(
618  Fragment &frag,
620  TensorCoord const &tile_offset) const {
621  load_with_byte_offset(frag, tile_offset, 0);
622  }
623 
625  CUTLASS_DEVICE
626  void load(
628  Fragment &frag,
630  TensorCoord const &tile_offset,
632  Index pointer_offset) const {
633  load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
634  }
635 
637  CUTLASS_DEVICE
640  Fragment &frag,
642  TensorCoord const &tile_offset,
644  Index byte_offset) const {
645  Index pointer_offset =
646  tile_offset.contiguous() * Shape::kContiguous /
647  Layout::kElementsPerAccess +
648  tile_offset.strided() * InstructionShape::kStrided * stride_;
649 
650  byte_offset += sizeof(AccessType) * pointer_offset;
651 
652  load_with_byte_offset(frag, byte_offset);
653  }
654 
662  CUTLASS_DEVICE
663  void set_kgroup_index(int k_group) {
664  // no operation here
665  }
666 };
667 
669 
676 template <
678  typename Shape_,
680  typename Element_,
682  typename InstructionShape_,
685  int OpDelta_>
687  Shape_, Operand::kA, Element_,
689  sizeof_bits<Element_>::value>,
690  InstructionShape_, OpDelta_, 32> {
691  public:
692 
694  using Shape = Shape_;
695 
697  static Operand const kOperand = Operand::kA;
698 
700  using Element = Element_;
701 
704 
706  using InstructionShape = InstructionShape_;
707 
709  static int const kOpDelta = OpDelta_;
710 
712  static int const kThreads = 32;
713 
716 
718  using Index = typename TensorRef::Index;
719 
721  using LongIndex = typename TensorRef::LongIndex;
722 
725 
730  layout::PitchLinearShape<InstructionShape::kRow,
731  InstructionShape::kColumn>,
732  kOpDelta, kThreads>;
733 
734  public:
735 
736  //
737  // Derived quantities
738  //
739 
741  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;
742 
743 private:
744 
746  Base iterator_;
747 
748 public:
749 
753 
757  TensorRef const &ref,
758  int lane_id
759  ): iterator_({ref.data(), ref.stride()}, lane_id) {
760  }
761 
765 
766  iterator_.add_pointer_offset(offset);
767 
768  return *this;
769  }
770 
774 
775  iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
776 
777  return *this;
778  }
779 
783 
784  ++iterator_;
785 
786  return *this;
787  }
788 
792 
793  --iterator_;
794 
795  return *this;
796  }
797 
799  CUTLASS_DEVICE
801  add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
802  return *this;
803  }
804 
806  CUTLASS_DEVICE
808  add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
809  return *this;
810  }
811 
814  void load(Fragment &frag) const {
815 
816  iterator_.load(frag);
817  }
818 
820  CUTLASS_DEVICE
823  Fragment &frag,
825  Index pointer_offset) const {
826  iterator_.load_with_pointer_offset(frag, pointer_offset);
827  }
828 
830  CUTLASS_DEVICE
833  Fragment &frag,
835  Index byte_offset) const {
836  iterator_.load_with_byte_offset(frag, byte_offset);
837  }
838 
840  CUTLASS_DEVICE
841  void load(
843  Fragment &frag,
845  TensorCoord const &tile_offset) const {
846  // TODO
847  }
848 
850  CUTLASS_DEVICE
851  void load(
853  Fragment &frag,
855  TensorCoord const &tile_offset,
857  Index pointer_offset) const {
858  // TODO
859  }
860 
862  CUTLASS_DEVICE
865  Fragment &frag,
867  TensorCoord const &tile_offset,
869  Index byte_offset) const {
870  iterator_.load_with_byte_offset(
871  frag,
872  {tile_offset.contiguous(), tile_offset.strided()},
873  byte_offset);
874  }
875 
883  CUTLASS_DEVICE
884  void set_kgroup_index(int k_group) {
885  iterator_.set_kgroup_index(k_group);
886  }
887 };
888 
890 
897 template <
899  typename Shape_,
901  typename Element_,
903  typename InstructionShape_,
906  int OpDelta_>
908  Shape_, Operand::kB, Element_,
910  sizeof_bits<Element_>::value>,
911  InstructionShape_, OpDelta_, 32> {
912  public:
913 
915  using Shape = Shape_;
916 
918  static Operand const kOperand = Operand::kB;
919 
920  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
921  "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
922 
924  using Element = Element_;
925 
928 
930  using InstructionShape = InstructionShape_;
931 
933  static int const kOpDelta = OpDelta_;
934 
936  static int const kThreads = 32;
937 
940 
942  using Index = typename TensorRef::Index;
943 
945  using LongIndex = typename TensorRef::LongIndex;
946 
949 
954  layout::PitchLinearShape<InstructionShape::kColumn,
955  InstructionShape::kRow>,
956  kOpDelta, kThreads>;
957 
958  public:
959 
960  //
961  // Derived quantities
962  //
963 
965  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;
966 
967 private:
968 
970  Base iterator_;
971 
972 public:
973 
977 
981  TensorRef const &ref,
982  int lane_id
983  ): iterator_({ref.data(), ref.stride()}, lane_id) {
984  }
985 
989 
990  iterator_.add_pointer_offset(offset);
991 
992  return *this;
993  }
994 
998 
999  iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
1000 
1001  return *this;
1002  }
1003 
1007 
1008  ++iterator_;
1009 
1010  return *this;
1011  }
1012 
1016 
1017  --iterator_;
1018 
1019  return *this;
1020  }
1021 
1023  CUTLASS_DEVICE
1025  add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
1026  return *this;
1027  }
1028 
1030  CUTLASS_DEVICE
1032  add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
1033  return *this;
1034  }
1035 
1038  void load(Fragment &frag) const {
1039 
1040  iterator_.load(frag);
1041  }
1042 
1044  CUTLASS_DEVICE
1047  Fragment &frag,
1049  Index pointer_offset) const {
1050  iterator_.load_with_pointer_offset(frag, pointer_offset);
1051  }
1052 
1054  CUTLASS_DEVICE
1057  Fragment &frag,
1059  Index byte_offset) const {
1060  iterator_.load_with_byte_offset(frag, byte_offset);
1061  }
1062 
1064  CUTLASS_DEVICE
1065  void load(
1067  Fragment &frag,
1069  TensorCoord const &tile_offset) const {
1070  // TODO
1071  }
1072 
1074  CUTLASS_DEVICE
1075  void load(
1077  Fragment &frag,
1079  TensorCoord const &tile_offset,
1081  Index pointer_offset) const {
1082  // TODO
1083  }
1084 
1086  CUTLASS_DEVICE
1089  Fragment &frag,
1091  TensorCoord const &tile_offset,
1093  Index byte_offset) const {
1094  iterator_.load_with_byte_offset(
1095  frag,
1096  {tile_offset.strided(), tile_offset.contiguous()},
1097  byte_offset);
1098  }
1099 
1107  CUTLASS_DEVICE
1108  void set_kgroup_index(int k_group) {
1109  iterator_.set_kgroup_index(k_group);
1110  }
1111 };
1112 
1114 
1123 template <
1125  typename Shape_,
1127  typename Element_,
1129  typename Layout_,
1131  typename InstructionShape_,
1134  typename OpDelta_>
1135 class MmaVoltaTensorOpAccumulatorTileIterator {
 1136  public:
1137 
1139  using Shape = Shape_;
1140 
1142  static Operand const kOperand = Operand::kC;
1143 
1145  using Element = Element_;
1146 
1148  using Layout = Layout_;
1149 
1151  using InstructionShape = InstructionShape_;
1152 
1154  using OpDelta = OpDelta_;
1155 
1157  static int const kThreads = 32;
1158 
1161 
1163  using Index = typename TensorRef::Index;
1164 
1167 
1170 
1172  struct Policy {
1173 
1176 
1177  static_assert(!(Shape::kRow % InterleavedTile::kRow) && !(Shape::kColumn % InterleavedTile::kColumn),
1178  "Shape of warp-level Mma must be divisible by operator shape.");
1179 
1181  "Layouts must be defined for logical MatrixCoord coordinate space.");
1182 
1184  using TileIterations = MatrixShape<
1185  Shape::kRow / InterleavedTile::kRow,
1186  Shape::kColumn / InterleavedTile::kColumn
1187  >;
1188 
1189  using MmaIterations =
1190  MatrixShape<InterleavedTile::kRow / InstructionShape::kM,
1191  InterleavedTile::kColumn / InstructionShape::kN>;
1192  };
1193 
1194 private:
1195 
1196  // Assume the accumulator tile is a multiple of interleaved 32x32 tiles.
1197  static int const kElementsPerPartial = 4;
1198  using EleShapePerPatial = typename platform::conditional<
1201  MatrixShape<1, 4> >::type;
1202  static int const kElementsPerMma = 8;
1203  static int const kAccumulatorPatials = 2;
1205 
1206 public:
1207 
1208  //
1209  // Derived quantities
1210  //
1211 
1213  using Fragment = Array<Element, Shape::kCount / kThreads>;
1214 
1215 private:
1216 
1218  TensorRef ref_;
1219 
1220 public:
1221 
1225 
1229  TensorRef const &ref,
1230  int lane_id
1231  ):
1232  ref_(ref) {
1233 
1234  int quad = (lane_id >> 2);
1235  int lane_in_quad = (lane_id & 3);
1236  int accum_m, accum_n;
1237 
1238  if (platform::is_same<Element, float>::value) {
1239  // (quad[2],quad[0])+lane_in_quad[0]
1240  accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
1241  // (quad[1])+lane_in_quad[1]
1242  accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
1243  (lane_in_quad & 2);
1244  } else {
1245  accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + lane_in_quad; // (quad[2],quad[0])
1246  accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
1247  }
1248  MatrixCoord lane_offset(accum_m, accum_n);
1249 
1250  ref_.add_coord_offset(lane_offset);
1251  }
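// Worked example (illustrative): for float accumulators and lane_id = 14, quad = 3 and
// lane_in_quad = 2, so accum_m = (0 + (3 & 1)) * 8 + (2 & 1) = 8 and
// accum_n = ((3 >> 1) & 1) * 4 * 2 + (2 & 2) = 10; the lane's TensorRef is offset to
// logical element (8, 10) of the warp-level accumulator tile.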
1252 
1256  ref_.add_pointer_offset(offset);
1257  return *this;
1258  }
1259 
1263 
1264  ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
1265 
1266  return *this;
1267  }
1268 
1272  // deliberate no-op
1273  return *this;
1274  }
1275 
1279  // deliberate no-op
1280  return *this;
1281  }
1282 
1284  CUTLASS_DEVICE
1286  add_tile_offset(tile_offset);
1287  return *this;
1288  }
1289 
1291  CUTLASS_DEVICE
1293  add_tile_offset(-tile_offset);
1294  return *this;
1295  }
1296 
1299  void load(Fragment &frag) const {
1300  load_with_pointer_offset(frag, 0);
1301  }
1302 
1306  Fragment &frag,
1307  Index pointer_offset) const {
1308 
1309  TensorRef offset_ref(ref_);
1310  offset_ref.add_pointer_offset(pointer_offset);
1311 
1313  for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
1315  for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
1317  for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
1319  for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
1320 
1321  int mma_accum_start =
1322  (((tile_n * Policy::TileIterations::kRow + tile_m) *
1323  Policy::MmaIterations::kColumn + mma_n) *
1324  Policy::MmaIterations::kRow + mma_m) *
1325  kElementsPerMma;
1326 
1328  for (int p = 0; p < kAccumulatorPatials; ++p) {
1330  for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
1332  for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
1333  int accum_m = tile_m * Policy::InterleavedTile::kRow +
1334  mma_m * QuadShapePerPatialMma::kRow + m * 2;
1335  int accum_n = tile_n * Policy::InterleavedTile::kColumn +
1336  mma_n * QuadShapePerPatialMma::kColumn +
1337  p * Policy::InterleavedTile::kColumn/2 + n;
1338  int idx = mma_accum_start + p * kElementsPerPartial +
1339  m * EleShapePerPatial::kColumn + n;
1340  frag[idx] = offset_ref.at({accum_m, accum_n});
1341  }
1342  }
1343  }
1344  }
1345  }
1346  }
1347  }
1348  }
1350  CUTLASS_DEVICE
1352  Fragment &frag,
1353  Index byte_offset) const {
1354 
1355  load_with_pointer_offset(frag, byte_offset / sizeof(Element));
1356  }
1357 
1360  void load(
1361  Fragment &frag,
1362  TensorCoord const &tile_offset) const {
1363 
1364  load(frag, tile_offset, 0);
1365  }
1366 
1369  void load(
1370  Fragment &frag,
1371  TensorCoord const &tile_offset,
1372  Index pointer_offset) const {
1373 
1374  load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
1375  }
1376 
1379  void store(Fragment const &frag) const {
1380  store_with_pointer_offset(frag, 0);
1381  }
1382 
1386  Fragment const &frag,
1387  Index pointer_offset) const {
1388 
1389  TensorRef offset_ref(ref_);
1390  offset_ref.add_pointer_offset(pointer_offset);
1391 
1393  for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
1395  for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
1397  for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
1399  for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
1400 
1401  int mma_accum_start =
1402  (((tile_n * Policy::TileIterations::kRow + tile_m) *
1403  Policy::MmaIterations::kColumn + mma_n) *
1404  Policy::MmaIterations::kRow + mma_m) *
1405  kElementsPerMma;
1406 
1408  for (int p = 0; p < kAccumulatorPatials; ++p) {
1410  for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
1412  for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
1413  int accum_m = tile_m * Policy::InterleavedTile::kRow +
1414  mma_m * QuadShapePerPatialMma::kRow + m * 2;
1415  int accum_n = tile_n * Policy::InterleavedTile::kColumn +
1416  mma_n * QuadShapePerPatialMma::kColumn +
1417  p * Policy::InterleavedTile::kColumn/2 + n;
1418  int idx = mma_accum_start + p * kElementsPerPartial +
1419  m * EleShapePerPatial::kColumn + n;
1420  offset_ref.at({accum_m, accum_n}) = frag[idx];
1421  }
1422  }
1423  }
1424  }
1425  }
1426  }
1427  }
1428  }
1429 
1433  Fragment const &frag,
1434  Index byte_offset) const {
1435 
1436  store_with_pointer_offset(frag, byte_offset / sizeof(Element));
1437  }
1438 
1441  void store(
1442  Fragment &frag,
1443  TensorCoord const &tile_offset) const {
1444 
1445  store(frag, tile_offset, 0);
1446  }
1447 
1450  void store(
1452  Fragment const &frag,
1454  TensorCoord const &tile_offset,
1456  Index pointer_offset) const {
1457  store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
1458  }
1459 };
1460 
1468 template <
1470  typename Shape_,
1472  Operand Operand_,
1474  typename Element_,
1476  typename InstructionShape_,
1479  int OpDelta_,
1481  int KBlock>
1482 class MmaVoltaTensorOpMultiplicandTileIterator<
 1483  Shape_, Operand_, Element_,
 1484  cutlass::layout::VoltaTensorOpMultiplicandCrosswise<
 1485  sizeof_bits<Element_>::value, KBlock>,
1486  InstructionShape_, OpDelta_, 32> {
1487  public:
1489  using Shape = Shape_;
1490 
1492  static Operand const kOperand = Operand_;
1493 
1494  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
1495  "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for "
1496  "A or B operands to warp-level Mma.");
1497 
1499  using Element = Element_;
1500 
1502  static int const kKBlock = KBlock;
1503 
1507 
1509  using InstructionShape = InstructionShape_;
1510 
1513  static int const kOpDelta = OpDelta_;
1514 
1516  static int const kThreads = 32;
1517 
1520 
1522  using Index = typename TensorRef::Index;
1523 
1526 
1529 
1531  struct Policy {
1532 
1535 
1537  using LdsIterations = layout::PitchLinearShape<1, Shape::kStrided / 32>;
1538 
1540  static int const kElementsPerAccess = 8;
1541 
1543  static int const kContiguousElementsPerLine = 4;
1544  };
1545 
1546  private:
1548  static_assert(kOpDelta == 1,
1549  "Alternative arrangements not supported at present.");
1550 
1553 
1554  public:
1555  //
1556  // Derived quantities
1557  //
1558 
1560  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;
1561 
1562  private:
1563 
1565  Index stride_;
1566 
1568  AccessType const *pointer_;
1569 
1571  Index byte_offset_;
1572 
1575  Index line_size;
1576 
1579  int k_group_idx_;
1580 
1581  public:
1585  : pointer_(nullptr),
1586  stride_(0),
1587  line_size(0),
1588  byte_offset_(0),
1589  k_group_idx_(0) {}
1590 
1592  CUTLASS_DEVICE
1594  : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
1595  stride_(ref.stride(0) * Policy::kElementsPerAccess),
1596  line_size((ref.stride(0) * Policy::kContiguousElementsPerLine) /
1597  Policy::kElementsPerAccess),
1598  k_group_idx_(0),
1599  byte_offset_(0) {
1600 
1601  int quad = (lane_id / 4);
1602  int lane_in_quad = (lane_id % 4);
1603  int access_contiguous;
1604 
1605  if(kOperand == Operand::kA) {
1606 
1607  // swizzle id: tid[4]|tid[1:0]|(tid[2]^tid[4])
1608  access_contiguous = ((quad & 0x4) << 1) + ((lane_in_quad) << 1) +
1609  ((quad & 0x1) ^ ((quad & 0x4) >> 2));
1610  } else {
1611 
1612  // swizzle id: tid[4]|tid[1:0]|(tid[3]^tid[4])
1613  access_contiguous = ((quad & 0x4) << 1) + (lane_in_quad << 1) +
1614  ((quad & 0x2) >> 1 ^ ((quad & 0x4) >> 2));
1615  }
1616 
1617  byte_offset_ = access_contiguous *
1618  sizeof(Element) * Policy::kElementsPerAccess;
1619  }
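// Worked example (illustrative, assuming Operand::kA and half-precision elements): for
// lane_id = 22, quad = 5 and lane_in_quad = 2, so access_contiguous =
// ((5 & 4) << 1) + (2 << 1) + ((5 & 1) ^ ((5 & 4) >> 2)) = 8 + 4 + 0 = 12 and
// byte_offset_ = 12 * 2 * 8 = 192 bytes.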
1620 
1622  CUTLASS_DEVICE
1624  byte_offset_ += offset * sizeof(Element);
1625 
1626  return *this;
1627  }
1628 
1631  CUTLASS_DEVICE
1633  TensorCoord const &tile_offset) {
1634 
1635  int contiguous_offset = tile_offset.contiguous();
1636  int strided_offset = tile_offset.strided();
1637  k_group_idx_ = 0;
1638 
1639  pointer_ += contiguous_offset *
1640  (InstructionShape::kContiguous /
1641  Policy::kContiguousElementsPerLine) *
1642  line_size +
1643  strided_offset * Shape::kStrided / 2;
1644  return *this;
1645  }
1646 
1648  CUTLASS_DEVICE
1650  k_group_idx_ = (k_group_idx_ + 1) % 8;
1651 
1652  if (k_group_idx_ == 4 || k_group_idx_ == 0) {
1653  byte_offset_ ^= 1 * sizeof(Element) * Policy::kElementsPerAccess;
1654  }
1655 
1656  pointer_ += line_size;
1657  return *this;
1658  }
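// Note (illustrative): each operator++ above advances one k-group; pointer_ steps forward
// by one shared-memory line, and whenever k_group_idx_ wraps to 0 or reaches 4 the XOR
// toggles byte_offset_ by one access of Policy::kElementsPerAccess elements to follow the
// crosswise swizzle.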
1659 
1663 
1666  CUTLASS_DEVICE
1668  TensorCoord const &tile_offset) {
1669  add_tile_offset(tile_offset);
1670  return *this;
1671  }
1672 
1675  CUTLASS_DEVICE
1677  TensorCoord const &tile_offset) {
1678  add_tile_offset(-tile_offset);
1679  return *this;
1680  }
1681 
1684  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
1685 
1687  CUTLASS_DEVICE
1690  Fragment &frag,
1692  Index byte_offset) const {
1693 
1694  AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);
1695 
1697  for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
1698 
1700  for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
1701 
1702  int access_idx = c + s * Policy::LdsIterations::kContiguous;
1703 
1704  AccessType const *source_ptr = pointer_ +
1705  Policy::LdsShape::kContiguous * c * line_size +
1706  Policy::LdsShape::kStrided * s / 2;
1707 
1708  char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
1709  fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
1710 
1711  // swap the upper 64 bits and lower 64 bits of this access
1712  if (k_group_idx_ & 0x2) {
1713  uint64_t *low = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2;
1714  uint64_t *high = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2 + 1;
1715  uint64_t tmp = *low;
1716  *low = *high;
1717  *high = tmp;
1718  }
1719  }
1720  }
1721  }
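// Note (illustrative): the 64-bit swap above exchanges the two halves of each 128-bit
// fetch whenever bit 1 of k_group_idx_ is set, keeping the element ordering delivered to
// the mma instruction consistent across k-groups.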
1722 
1724  CUTLASS_DEVICE
1727  Fragment &frag,
1729  Index pointer_offset) const {
1730  load_with_byte_offset(frag, pointer_offset * sizeof(Element));
1731  }
1732 
1734  CUTLASS_DEVICE
1735  void load(
1737  Fragment &frag,
1739  TensorCoord const &tile_offset) const {
1740  load_with_byte_offset(frag, tile_offset, 0);
1741  }
1742 
1744  CUTLASS_DEVICE
1745  void load(
1747  Fragment &frag,
1749  TensorCoord const &tile_offset,
1751  Index pointer_offset) const {
1752  load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
1753  }
1754 
1756  CUTLASS_DEVICE
1759  Fragment &frag,
1761  TensorCoord const &tile_offset,
1763  Index byte_offset) const {
1764  Index pointer_offset = tile_offset.contiguous() *
1765  InstructionShape::kContiguous /
1766  Policy::kElementsPerAccess +
1767  tile_offset.strided() * Shape::kStrided * stride_;
1768 
1769  byte_offset += sizeof(AccessType) * pointer_offset;
1770 
1771  load_with_byte_offset(frag, byte_offset);
1772  }
1773 
1781  CUTLASS_DEVICE
1782  void set_kgroup_index(int k_group) {
1783  k_group_idx_ = k_group;
1784  }
1785 };
1786 
1794 template <
1796  typename Shape_,
1798  Operand Operand_,
1800  typename Element_,
1802  typename InstructionShape_,
1805  int OpDelta_,
1807  int KBlock>
1809  Shape_, Operand_, Element_,
1811  sizeof_bits<Element_>::value, KBlock>,
1812  InstructionShape_, OpDelta_, 32> {
1813  public:
1815  using Shape = Shape_;
1816 
1818  static Operand const kOperand = Operand_;
1819 
1820  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
1821  "MmaTensorOpMultiplicandIterator may only be instantiated for "
1822  "A or B operands to warp-level Mma.");
1823 
1825  using Element = Element_;
1826 
1828  static int const kKBlock = KBlock;
1829 
1830 
1834 
1836  using InstructionShape = InstructionShape_;
1837 
1840  static int const kOpDelta = OpDelta_;
1841 
1843  static int const kThreads = 32;
1844 
1847 
1849  using Index = typename TensorRef::Index;
1850 
1853 
1856 
1861  kKBlock>,
1862  layout::PitchLinearShape<InstructionShape::kRow,
1863  InstructionShape::kColumn>,
1864  kOpDelta, kThreads>;
1865 
1866  public:
1867  //
1868  // Derived quantities
1869  //
1870 
1872  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;
1873 
1874  private:
1876  Base iterator_;
1877 
1878  public:
1882 
1886  : iterator_({ref.data(), ref.stride()}, lane_id) {}
1887 
1891  iterator_.add_pointer_offset(offset);
1892 
1893  return *this;
1894  }
1895 
1900  TensorCoord const &tile_offset) {
1901  iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
1902 
1903  return *this;
1904  }
1905 
1909  ++iterator_;
1910 
1911  return *this;
1912  }
1913 
1917  --iterator_;
1918 
1919  return *this;
1920  }
1921 
1924  CUTLASS_DEVICE
1926  TensorCoord const &tile_offset) {
1927  add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
1928  return *this;
1929  }
1930 
1933  CUTLASS_DEVICE
1935  TensorCoord const &tile_offset) {
1936  add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
1937  return *this;
1938  }
1939 
1942  void load(Fragment &frag) const { iterator_.load(frag); }
1943 
1945  CUTLASS_DEVICE
1948  Fragment &frag,
1950  Index pointer_offset) const {
1951  iterator_.load_with_pointer_offset(frag, pointer_offset);
1952  }
1953 
1955  CUTLASS_DEVICE
1958  Fragment &frag,
1960  Index byte_offset) const {
1961  iterator_.load_with_byte_offset(frag, byte_offset);
1962  }
1963 
1965  CUTLASS_DEVICE
1966  void load(
1968  Fragment &frag,
1970  TensorCoord const &tile_offset) const {
1971  // TODO
1972  assert(0);
1973  }
1974 
1976  CUTLASS_DEVICE
1977  void load(
1979  Fragment &frag,
1981  TensorCoord const &tile_offset,
1983  Index pointer_offset) const {
1984  // TODO
1985  assert(0);
1986  }
1987 
1989  CUTLASS_DEVICE
1992  Fragment &frag,
1994  TensorCoord const &tile_offset,
1996  Index byte_offset) const {
1997  iterator_.load_with_byte_offset(
1998  frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
1999  }
2000 
2008  CUTLASS_DEVICE
2009  void set_kgroup_index(int k_group) {
2010  iterator_.set_kgroup_index(k_group);
2011  }
2012 };
2013 
2015 
2023 template <
2025  typename Shape_,
2027  Operand Operand_,
2029  typename Element_,
2031  typename InstructionShape_,
2034  int OpDelta_,
2036  int KBlock>
2038  Shape_, Operand_, Element_,
2040  sizeof_bits<Element_>::value, KBlock>,
2041  InstructionShape_, OpDelta_, 32> {
2042  public:
2044  using Shape = Shape_;
2045 
2047  static Operand const kOperand = Operand_;
2048 
2049  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
2050  "MmaTensorOpMultiplicandIterator may only be instantiated for "
2051  "A or B operands to warp-level Mma.");
2052 
2054  using Element = Element_;
2055 
2057  static int const kKBlock = KBlock;
2058 
2062 
2064  using InstructionShape = InstructionShape_;
2065 
2068  static int const kOpDelta = OpDelta_;
2069 
2071  static int const kThreads = 32;
2072 
2075 
2077  using Index = typename TensorRef::Index;
2078 
2081 
2084 
2089  kKBlock>,
2090  layout::PitchLinearShape<InstructionShape::kColumn,
2091  InstructionShape::kRow>,
2092  kOpDelta, kThreads>;
2093 
2094  public:
2095  //
2096  // Derived quantities
2097  //
2098 
2100  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;
2101 
2102  private:
2104  Base iterator_;
2105 
2106  public:
2110 
2114  : iterator_({ref.data(), ref.stride()}, lane_id) {}
2115 
2119  iterator_.add_pointer_offset(offset);
2120 
2121  return *this;
2122  }
2123 
2128  TensorCoord const &tile_offset) {
2129  iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
2130 
2131  return *this;
2132  }
2133 
2137  ++iterator_;
2138 
2139  return *this;
2140  }
2141 
2145  --iterator_;
2146 
2147  return *this;
2148  }
2149 
2152  CUTLASS_DEVICE
2154  TensorCoord const &tile_offset) {
2155  add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
2156  return *this;
2157  }
2158 
2161  CUTLASS_DEVICE
2163  TensorCoord const &tile_offset) {
2164  add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
2165  return *this;
2166  }
2167 
2170  void load(Fragment &frag) const { iterator_.load(frag); }
2171 
2173  CUTLASS_DEVICE
2176  Fragment &frag,
2178  Index pointer_offset) const {
2179  iterator_.load_with_pointer_offset(frag, pointer_offset);
2180  }
2181 
2183  CUTLASS_DEVICE
2186  Fragment &frag,
2188  Index byte_offset) const {
2189  iterator_.load_with_byte_offset(frag, byte_offset);
2190  }
2191 
2193  CUTLASS_DEVICE
2194  void load(
2196  Fragment &frag,
2198  TensorCoord const &tile_offset) const {
2199  // TODO
2200  assert(0);
2201  }
2202 
2204  CUTLASS_DEVICE
2205  void load(
2207  Fragment &frag,
2209  TensorCoord const &tile_offset,
2211  Index pointer_offset) const {
2212  // TODO
2213  assert(0);
2214  }
2215 
2217  CUTLASS_DEVICE
2220  Fragment &frag,
2222  TensorCoord const &tile_offset,
2224  Index byte_offset) const {
2225  iterator_.load_with_byte_offset(
2226  frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
2227  }
2228 
2236  CUTLASS_DEVICE
2237  void set_kgroup_index(int k_group) {
2238  iterator_.set_kgroup_index(k_group);
2239  }
2240 };
2241 
2242 } // namespace warp
2243 } // namespace gemm
2244 } // namespace cutlass
2245 