CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
mma_tensor_op_tile_iterator.h
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/cutlass.h"
32 
33 #include "cutlass/array.h"
34 #include "cutlass/numeric_types.h"
35 #include "cutlass/tensor_ref.h"
36 #include "cutlass/matrix_shape.h"
37 
39 #include "cutlass/gemm/gemm.h"
40 
41 #include "cutlass/layout/matrix.h"
42 #include "cutlass/layout/tensor.h"
45 
47 #include "cutlass/fast_math.h"
48 
50 
51 namespace cutlass {
52 namespace gemm {
53 namespace warp {
54 
56 
57 template <
59  typename Shape_,
61  Operand Operand_,
63  typename Element_,
65  typename Layout_,
67  typename InstructionShape_,
70  int OpDelta_,
72  int Threads,
74  int PartitionsK_ = 1>
76 class MmaTensorOpMultiplicandTileIterator;
78 
85 template <
87  typename Shape_,
89  Operand Operand_,
91  typename Element_,
93  typename InstructionShape_,
96  int OpDelta_,
98  int PartitionsK_>
99 class MmaTensorOpMultiplicandTileIterator<
100  Shape_, Operand_, Element_,
101  cutlass::layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
102  64>,
103  InstructionShape_, OpDelta_, 32, PartitionsK_> {
104  public:
105 
107  using Shape = Shape_;
108 
110  static Operand const kOperand = Operand_;
111 
112  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
113  "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
114 
116  using Element = Element_;
117 
121  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value, 64>;
123  using InstructionShape = InstructionShape_;
124 
126  static int const kOpDelta = OpDelta_;
127 
129  static int const kThreads = 32;
130 
132  static int const kPartitionsK = PartitionsK_;
133 
136  using TensorRef = TensorRef<Element, Layout>;
138  using Index = typename TensorRef::Index;
139 
141  using LongIndex = typename TensorRef::LongIndex;
142 
145  using TensorCoord = typename TensorRef::TensorCoord;
147  struct Policy {
148  static_assert(
149  !(Shape::kContiguous % InstructionShape::kContiguous),
150  "Shape of warp-level Mma must be divisible by operator shape.");
151 
152  // Determine number of elements along outer dimension per individual LDSM op
153  static int const kLdsmOpOuter = Layout::kElementsPerAccess;
154  static int const kLdsmOpInner = 8;
155 
156  static_assert(!(Shape::kContiguous % kLdsmOpOuter),
157  "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
158 
159  static_assert(!(Shape::kStrided % kLdsmOpInner),
160  "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
161 
163  static int const LdsmShapeStrided =
164  InstructionShape::kStrided / kLdsmOpInner;
165  static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
166  using LdsmShape =
168  layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
170  using LdsmIterations = layout::PitchLinearShape<
171  Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
172  1>;
173 
175  static int const kGroupsPerTile =
176  Shape::kStrided / InstructionShape::kStrided;
177  };
178 
179 private:
180 
182  static_assert(kOpDelta == 1,
183  "Alternative arrangements not supported at present.");
184 
186  static int const kPointerCount =
187  Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous;
188 
190  using AccessType = Array<Element, Layout::kElementsPerAccess>;
191 
193  int k_group_idx_;
194 
195 public:
196 
197  //
198  // Derived quantities
199  //
200 
202  using Fragment = Array<Element, Shape::kCount / kThreads>;
203 
204 private:
205 
207  Index stride_;
208 
210  AccessType const *pointer_[kPointerCount];
211 
213  Index byte_offset_;
214 
215 public:
216 
219  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
220 
222  CUTLASS_DEVICE
224  TensorRef const &ref,
225  int lane_id
226  ):
227  stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0),
228  k_group_idx_(0) {
229 
230  int quad_pair = (lane_id >> 3);
231  int lane_in_quad = (lane_id & 3);
232  int lane_in_quad_pair = (lane_id & 7);
234  for (int i = 0; i < kPointerCount; ++i) {
235  int partition_contiguous_idx = -1;
236  int access_contiguous_idx = -1;
237  int access_strided_idx = -1;
238 
239  if (Policy::LdsmShape::kContiguous == 4) {
240  partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ i);
241  access_contiguous_idx = (quad_pair ^ lane_in_quad);
242  access_strided_idx = lane_in_quad_pair;
243  }
244  int access_contiguous =
245  partition_contiguous_idx * Layout::PartitionShape::kContiguous +
246  access_contiguous_idx;
247 
248  int access_strided = access_strided_idx;
249 
250  pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
251  access_contiguous + access_strided * stride_;
252  }
253  }
254 
256  CUTLASS_DEVICE
258 
259  byte_offset_ += offset * sizeof(Element);
260 
261  return *this;
262  }
263 
267 
268  int contiguous_offset = tile_offset.contiguous();
269  if (Shape::kContiguous ==
270  Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
271  if (tile_offset.contiguous() % 2) {
273  for (int i = 0; i < kPointerCount / 2; ++i) {
274  AccessType const *tmp_pointer = pointer_[i];
275  pointer_[i] = pointer_[i + kPointerCount / 2];
276  pointer_[i + kPointerCount / 2] = tmp_pointer;
277  }
278  }
279  contiguous_offset = (tile_offset.contiguous() >> 1) << 1;
280  }
281 
282  int offset = (tile_offset.strided() * InstructionShape::kStrided) *
283  stride_ * Layout::kElementsPerAccess +
284  contiguous_offset * Shape::kContiguous;
285 
286  add_pointer_offset(offset);
287 
288  return *this;
289  }
290 
292  CUTLASS_DEVICE
294 
295  add_tile_offset({0, 1});
296 
297  if (kPartitionsK > 1) {
298  ++k_group_idx_;
299  // Jump to next stage
300  if (k_group_idx_ == Policy::kGroupsPerTile) {
301  k_group_idx_ = 0;
302  add_tile_offset(
303  {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
304  }
305  }
306 
307  return *this;
308  }
309 
313  byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
314  Layout::kElementsPerAccess;
315 
316  return *this;
317  }
318 
320  CUTLASS_DEVICE
322  add_tile_offset(tile_offset);
323  return *this;
324  }
325 
327  CUTLASS_DEVICE
329  add_tile_offset(-tile_offset);
330  return *this;
331  }
332 
335  void load(Fragment &frag) const {
336 
337  load_with_byte_offset(frag, 0);
338  }
339 
341  CUTLASS_DEVICE
344  Fragment &frag,
346  Index byte_offset) const {
347 
348  Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
349  reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
350 
352  for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
353 
355  for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
356 
357  int access_idx = c + s * Policy::LdsmIterations::kContiguous;
358 
359  AccessType const *source_ptr =
360  pointer_[c % kPointerCount] +
361  Layout::TileShape::kContiguous * (c / kPointerCount) +
362  Policy::LdsmShape::kStrided * s * stride_;
363 
364  char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
365 
366  cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
367  fetch_ptr[access_idx],
368  source_byte_ptr
369  );
370  }
371  }
372  }
373 
375  CUTLASS_DEVICE
378  Fragment &frag,
380  Index pointer_offset) const {
381  load_with_byte_offset(frag, pointer_offset * sizeof(Element));
382  }
383 
385  CUTLASS_DEVICE
386  void load(
388  Fragment &frag,
390  TensorCoord const &tile_offset) const {
391  load_with_byte_offset(frag, tile_offset, 0);
392  }
393 
395  CUTLASS_DEVICE
396  void load(
398  Fragment &frag,
400  TensorCoord const &tile_offset,
402  Index pointer_offset) const {
403  load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
404  }
405 
407  CUTLASS_DEVICE
410  Fragment &frag,
412  TensorCoord const &tile_offset,
414  Index byte_offset) const {
415  Index pointer_offset =
416  tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess +
417  tile_offset.strided() * InstructionShape::kStrided * stride_;
418 
419  byte_offset += sizeof(AccessType) * pointer_offset;
420 
421  load_with_byte_offset(frag, byte_offset);
422  }
423 
431  CUTLASS_DEVICE
432  void set_kgroup_index(int k_group) {
433  // no op
434  }
435 };
436 
444 template <
446  typename Shape_,
448  Operand Operand_,
450  typename Element_,
452  typename InstructionShape_,
455  int OpDelta_,
457  int PartitionsK_>
458 class MmaTensorOpMultiplicandTileIterator<
459  Shape_, Operand_, Element_,
460  cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
461  sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
462  InstructionShape_, OpDelta_, 32, PartitionsK_> {
463  public:
464 
466  using Shape = Shape_;
467 
469  static Operand const kOperand = Operand_;
470 
471  static_assert(kOperand == Operand::kA,
472  "MmaTensorOpMultiplicandIterator for ColumnMajor Congruous may "
473  "only be instantiated for A operand to warp-level Mma.");
474 
476  using Element = Element_;
477 
480  sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
481 
483  using InstructionShape = InstructionShape_;
484 
486  static int const kOpDelta = OpDelta_;
487 
489  static int const kThreads = 32;
490 
493 
495  using Index = typename TensorRef::Index;
496 
498  using LongIndex = typename TensorRef::LongIndex;
499 
502 
507  int(128 / sizeof(Element_))>,
508  layout::PitchLinearShape<InstructionShape::kRow,
509  InstructionShape::kColumn>,
510  kOpDelta, kThreads, PartitionsK_>;
511 
512  public:
513 
514  //
515  // Derived quantities
516  //
517 
519  using Fragment = Array<Element, Shape::kCount / kThreads>;
520 
521 private:
522 
524  Base iterator_;
525 
526 public:
527 
531 
535  TensorRef const &ref,
536  int lane_id
537  ): iterator_({ref.data(), ref.stride()}, lane_id) {
538  }
539 
543 
544  iterator_.add_pointer_offset(offset);
545 
546  return *this;
547  }
548 
552 
553  iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
554 
555  return *this;
556  }
557 
561 
562  ++iterator_;
563 
564  return *this;
565  }
566 
570 
571  --iterator_;
572 
573  return *this;
574  }
575 
577  CUTLASS_DEVICE
579  add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
580  return *this;
581  }
582 
584  CUTLASS_DEVICE
586  add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
587  return *this;
588  }
589 
592  void load(Fragment &frag) const {
593 
594  iterator_.load(frag);
595  }
596 
598  CUTLASS_DEVICE
601  Fragment &frag,
603  Index pointer_offset) const {
604  iterator_.load_with_pointer_offset(frag, pointer_offset);
605  }
606 
608  CUTLASS_DEVICE
611  Fragment &frag,
613  Index byte_offset) const {
614  iterator_.load_with_byte_offset(frag, byte_offset);
615  }
616 
618  CUTLASS_DEVICE
619  void load(
621  Fragment &frag,
623  TensorCoord const &tile_offset) const {
624  // TODO
625  }
626 
628  CUTLASS_DEVICE
629  void load(
631  Fragment &frag,
633  TensorCoord const &tile_offset,
635  Index pointer_offset) const {
636  // TODO
637  }
638 
640  CUTLASS_DEVICE
643  Fragment &frag,
645  TensorCoord const &tile_offset,
647  Index byte_offset) const {
648  iterator_.load_with_byte_offset(
649  frag,
650  {tile_offset.contiguous(), tile_offset.strided()},
651  byte_offset);
652  }
653 
661  CUTLASS_DEVICE
662  void set_kgroup_index(int k_group) {
663  iterator_.set_kgroup_index(k_group);
664  }
665 };
666 
668 
675 template <
677  typename Shape_,
679  Operand Operand_,
681  typename Element_,
683  typename InstructionShape_,
686  int OpDelta_,
688  int PartitionsK_>
689 class MmaTensorOpMultiplicandTileIterator<
690  Shape_, Operand_, Element_,
691  cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
692  sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
693  InstructionShape_, OpDelta_, 32, PartitionsK_> {
694  public:
695 
697  using Shape = Shape_;
698 
700  static Operand const kOperand = Operand_;
701 
702  static_assert(kOperand == Operand::kB,
703  "MmaTensorOpMultiplicandIterator for RowMajor Congruous may "
704  "only be instantiated for B operand to warp-level Mma.");
705 
707  using Element = Element_;
708 
711  sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
712 
714  using InstructionShape = InstructionShape_;
715 
717  static int const kOpDelta = OpDelta_;
718 
720  static int const kThreads = 32;
721 
724 
726  using Index = typename TensorRef::Index;
727 
729  using LongIndex = typename TensorRef::LongIndex;
730 
733 
737  layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
738  int(128 / sizeof(Element_))>,
739  layout::PitchLinearShape<InstructionShape::kColumn,
740  InstructionShape::kRow>,
741  kOpDelta, kThreads, PartitionsK_>;
742 
743  public:
744 
745  //
746  // Derived quantities
747  //
748 
750  using Fragment = Array<Element, Shape::kCount / kThreads>;
751 
752 private:
753 
755  Base iterator_;
756 
757 public:
758 
762 
766  TensorRef const &ref,
767  int lane_id
768  ): iterator_({ref.data(), ref.stride()}, lane_id) {
769  }
770 
774 
775  iterator_.add_pointer_offset(offset);
776 
777  return *this;
778  }
779 
783 
784  iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
785 
786  return *this;
787  }
788 
792 
793  ++iterator_;
794 
795  return *this;
796  }
797 
801 
802  --iterator_;
803 
804  return *this;
805  }
806 
808  CUTLASS_DEVICE
810  add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
811  return *this;
812  }
813 
815  CUTLASS_DEVICE
817  add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
818  return *this;
819  }
820 
823  void load(Fragment &frag) const {
824 
825  iterator_.load(frag);
826  }
827 
829  CUTLASS_DEVICE
832  Fragment &frag,
834  Index pointer_offset) const {
835  iterator_.load_with_pointer_offset(frag, pointer_offset);
836  }
837 
839  CUTLASS_DEVICE
842  Fragment &frag,
844  Index byte_offset) const {
845  iterator_.load_with_byte_offset(frag, byte_offset);
846  }
847 
849  CUTLASS_DEVICE
850  void load(
852  Fragment &frag,
854  TensorCoord const &tile_offset) const {
855  // TODO
856  }
857 
859  CUTLASS_DEVICE
860  void load(
862  Fragment &frag,
864  TensorCoord const &tile_offset,
866  Index pointer_offset) const {
867  // TODO
868  }
869 
871  CUTLASS_DEVICE
874  Fragment &frag,
876  TensorCoord const &tile_offset,
878  Index byte_offset) const {
879  iterator_.load_with_byte_offset(
880  frag,
881  {tile_offset.strided(), tile_offset.contiguous()},
882  byte_offset);
883  }
884 
892  CUTLASS_DEVICE
893  void set_kgroup_index(int k_group) {
894  iterator_.set_kgroup_index(k_group);
895  }
896 };
897 
899 
907 template <
909  typename Shape_,
911  Operand Operand_,
913  typename Element_,
915  typename InstructionShape_,
918  int OpDelta_,
920  int Crosswise,
922  int PartitionsK_>
923 class MmaTensorOpMultiplicandTileIterator<
924  Shape_, Operand_, Element_,
925  cutlass::layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
926  Crosswise>,
927  InstructionShape_, OpDelta_, 32, PartitionsK_> {
928  public:
930  using Shape = Shape_;
931 
933  static Operand const kOperand = Operand_;
934 
935  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
936  "MmaTensorOpMultiplicandIterator may only be instantiated for "
937  "A or B operands to warp-level Mma.");
938 
940  using Element = Element_;
941 
943  static int const kCrosswise = Crosswise;
944 
948 
950  using InstructionShape = InstructionShape_;
951 
954  static int const kOpDelta = OpDelta_;
955 
957  static int const kThreads = 32;
958 
960  static int const kPartitionsK = PartitionsK_;
961 
964 
966  using Index = typename TensorRef::Index;
967 
969  using LongIndex = typename TensorRef::LongIndex;
970 
973 
975  struct Policy {
976  static_assert(
977  !(Shape::kContiguous % InstructionShape::kContiguous),
978  "Shape of warp-level Mma must be divisible by operator shape.");
979 
980  // Determine number of elements along outer dimension per individual LDSM op
981  static int const kLdsmOpOuter = Layout::kElementsPerAccess;
982  static int const kLdsmOpInner = 8;
983 
984  static_assert(!(Shape::kContiguous % kLdsmOpOuter),
985  "Shape of warp-level mma must be divisible by LDSM's "
986  "fundamental tile size.");
987 
988  static_assert(!(Shape::kStrided % kLdsmOpInner),
989  "Shape of warp-level mma must be divisible by LDSM's "
990  "fundamental tile size.");
991 
993  static int const LdsmShapeContiguous =
994  InstructionShape::kContiguous / kLdsmOpOuter;
995  static int const LdsmShapeStrided =
996  ((4 / LdsmShapeContiguous * kLdsmOpInner) > Shape::kStrided)
997  ? (Shape::kStrided / kLdsmOpInner)
998  : (4 / LdsmShapeContiguous);
999  using LdsmShape =
1001  layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
1003  using LdsmIterations =
1004  layout::PitchLinearShape<1, Shape::kStrided / kLdsmOpInner /
1005  LdsmShape::kStrided>;
1006 
1008  static int const kGroupsPerTile = Layout::TileShape::kContiguous /
1009  Layout::kFactor / LdsmShape::kContiguous;
1010  };
1011 
1012  private:
1014  static_assert(kOpDelta == 1,
1015  "Alternative arrangements not supported at present.");
1016 
1018  using AccessType = Array<Element, Layout::kElementsPerAccess>;
1019 
1020  public:
1021  //
1022  // Derived quantities
1023  //
1024 
1026  using Fragment = Array<Element, Shape::kCount / kThreads>;
1027 
1028  private:
1029 
1034  int sections_;
1035 
1037  Index stride_;
1038 
1040  AccessType const *pointer_;
1041 
1043  Index byte_offset_;
1044 
1047  int k_group_idx_;
1048 
1049  public:
1053  : pointer_(nullptr),
1054  sections_(0),
1055  stride_(0),
1056  byte_offset_(0),
1057  k_group_idx_(0) {}
1058 
1060  CUTLASS_DEVICE
1062  : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
1063  sections_(ref.stride(0) / kCrosswise),
1064  // stride_ = kCrosswise x sections_ x kFactor
1065  stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
1066  byte_offset_(0),
1067  k_group_idx_(0) {
1068  // Warp level iterator at most use double buffer to hide latency. If there
1069  // are more than 2 sections, every stage should have more than 1 section.
1070  // TODO: refactor code after every case is implemented
1071 
1072  // Turing silicon requires all 32 threads in a warp provide valid addresses
1073  // even for LDSM.1 and LDSM.2
1074 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 750))
1075  lane_id = lane_id % (Policy::LdsmShape::kCount * Policy::kLdsmOpInner);
1076 #endif
1077 
1078  int lane_in_pair = (lane_id & 1);
1079  int lane_in_quad = (lane_id & 3);
1080  int lane_in_quad_pair = (lane_id & 7);
1081  int lane_in_quad_quad = (lane_id & 15);
1082 
1083  int partition_contiguous_idx = -1;
1084  int access_contiguous_idx = -1;
1085  int access_strided_idx = -1;
1086 
1087  if (Layout::kFactor == 4) {
1088  // Super Integer matrix multiply Interleaved-32
1089 
1090  int factor_in_partition =
1091  (Layout::PartitionShape::kContiguous * Layout::kFactor /
1092  Layout::TileShape::kContiguous);
1093 
1094  if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
1095  // Integer matrix multiply 8816 A/B
1096  partition_contiguous_idx = lane_in_quad / factor_in_partition;
1097  access_contiguous_idx = ((lane_in_pair * factor_in_partition) ^
1098  (lane_in_quad_quad / Layout::kFactor));
1099  access_strided_idx = lane_id / Layout::kFactor;
1100  }
1101  } else if (Layout::kFactor == 2) {
1102  // Super Matrix multiply kBlock = 32
1103  if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
1104  // (Q stands for 1 8x128bit block).
1105  // Q0
1106  // Q1
1107  // Q2
1108  // Q3
1109  // Four blocks are next to each other in the strided dimension.
1110  partition_contiguous_idx = (lane_id % Layout::kFactor);
1111  access_contiguous_idx = (lane_in_quad_pair / Layout::kFactor);
1112  access_strided_idx = lane_id / Layout::kFactor;
1113  }
1114  } else if (Layout::kFactor == 1) {
1115  // Super Matrix multiply kBlock = 64
1116  if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
1117  // Q0
1118  // Q1
1119  // Q2
1120  // Q3
1121  partition_contiguous_idx = (lane_in_quad_pair >> 2);
1122  access_contiguous_idx = lane_in_quad;
1123  access_strided_idx = lane_id;
1124  }
1125  }
1126 
1127  int access_contiguous =
1128  partition_contiguous_idx * Layout::PartitionShape::kContiguous +
1129  access_contiguous_idx;
1130 
1131  int access_strided = access_strided_idx;
1132 
1133  byte_offset_ = (access_contiguous + access_strided * stride_) *
1134  sizeof_bits<Element>::value * Layout::kElementsPerAccess / 8;
1135  }
1136 
1138  CUTLASS_DEVICE
1140  byte_offset_ += offset * sizeof_bits<Element>::value / 8;
1141 
1142  return *this;
1143  }
1144 
1147  CUTLASS_DEVICE
1149  TensorCoord const &tile_offset) {
1150  int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile;
1151  int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile;
1152 
1153  byte_offset_ ^= k_groups_delta * sizeof_bits<Element>::value *
1154  Layout::kElementsPerAccess / 8;
1155  pointer_ +=
1156  tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor +
1157  whole_tiles * stride_ / sections_;
1158  return *this;
1159  }
1160 
1162  CUTLASS_DEVICE
1164 
1165  // Integer matrix multiply 8816 Interleaved-32
1166  // ^1 ^1
1167  // Matrix multiply 1688 kblock=32 || Integer matrix multiply 8816 kblock=64
1168  // ^1 ^3 ^1 ^3
1169  // Matrix multiply 1688 kblock=64
1170  // ^1 ^3 ^1 ^7 ^1 ^3 ^1 ^7
1171  if ((Policy::kGroupsPerTile / kPartitionsK) > 1) {
1172  int mask = ((Policy::kGroupsPerTile / kPartitionsK) == 8)
1173  ? 3
1174  : (((Policy::kGroupsPerTile / kPartitionsK) == 4) ? 1 : 0);
1175 
1176  if (((k_group_idx_ & mask) % 2) == 0)
1177  byte_offset_ ^= 1 * Policy::LdsmShape::kContiguous *
1179  Layout::kElementsPerAccess / 8;
1180  else if ((k_group_idx_ & mask) == 1)
1181  byte_offset_ ^= 3 * Policy::LdsmShape::kContiguous *
1183  Layout::kElementsPerAccess / 8;
1184  else if ((k_group_idx_ & mask) == 3)
1185  byte_offset_ ^= 7 * Policy::LdsmShape::kContiguous *
1187  Layout::kElementsPerAccess / 8;
1188  }
1189 
1190  k_group_idx_++;
1191 
1192  if (k_group_idx_ == (Policy::kGroupsPerTile / kPartitionsK)) {
1193  k_group_idx_ = 0;
1194  add_tile_offset({Policy::kGroupsPerTile, 0});
1195  }
1196 
1197  return *this;
1198  }
1199 
1203 
1206  CUTLASS_DEVICE
1208  TensorCoord const &tile_offset) {
1209  add_tile_offset(tile_offset);
1210  return *this;
1211  }
1212 
1215  CUTLASS_DEVICE
1217  TensorCoord const &tile_offset) {
1218  add_tile_offset(-tile_offset);
1219  return *this;
1220  }
1221 
1224  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
1225 
1227  CUTLASS_DEVICE
1230  Fragment &frag,
1232  Index byte_offset) const {
1233  Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
1234  reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
1235 
1237  for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
1239  for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
1240  int access_idx = c + s * Policy::LdsmIterations::kContiguous;
1241 
1242  AccessType const *source_ptr =
1243  pointer_ + Policy::LdsmShape::kContiguous * c +
1244  Policy::kLdsmOpInner / Layout::kFactor *
1245  Policy::LdsmShape::kStrided * s * stride_;
1246 
1247  char const *source_byte_ptr =
1248  reinterpret_cast<char const *>(source_ptr) + byte_offset +
1249  byte_offset_;
1250 
1251  cutlass::arch::ldsm<layout::RowMajor, Policy::LdsmShape::kCount>(
1252  fetch_ptr[access_idx], source_byte_ptr);
1253  }
1254  }
1255  }
1256 
1258  CUTLASS_DEVICE
1261  Fragment &frag,
1263  Index pointer_offset) const {
1264  load_with_byte_offset(frag, pointer_offset * sizeof(Element));
1265  }
1266 
1268  CUTLASS_DEVICE
1269  void load(
1271  Fragment &frag,
1273  TensorCoord const &tile_offset) const {
1274  load_with_byte_offset(frag, tile_offset, 0);
1275  }
1276 
1278  CUTLASS_DEVICE
1279  void load(
1281  Fragment &frag,
1283  TensorCoord const &tile_offset,
1285  Index pointer_offset) const {
1286  load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
1287  }
1288 
1290  CUTLASS_DEVICE
1293  Fragment &frag,
1295  TensorCoord const &tile_offset,
1297  Index byte_offset) const {
1298  Index pointer_offset = tile_offset.contiguous() *
1299  InstructionShape::kContiguous /
1300  Layout::kElementsPerAccess +
1301  tile_offset.strided() * Shape::kStrided * stride_;
1302 
1303  byte_offset += sizeof_bits<AccessType>::value * pointer_offset / 8;
1304 
1305  load_with_byte_offset(frag, byte_offset);
1306  }
1307 
1315  CUTLASS_DEVICE
1316  void set_kgroup_index(int k_group) {
1317  k_group_idx_ = k_group % (Policy::kGroupsPerTile / kPartitionsK);
1318  }
1319 };
1320 
1322 
1330 template <
1332  typename Shape_,
1334  Operand Operand_,
1336  typename Element_,
1338  typename InstructionShape_,
1341  int OpDelta_,
1343  int Crosswise,
1345  int PartitionsK_>
1346 class MmaTensorOpMultiplicandTileIterator<
1347  Shape_, Operand_, Element_,
1348  cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
1349  sizeof_bits<Element_>::value, Crosswise>,
1350  InstructionShape_, OpDelta_, 32, PartitionsK_> {
1351  public:
1353  using Shape = Shape_;
1354 
1356  static Operand const kOperand = Operand_;
1357 
1358  static_assert(kOperand == Operand::kB,
1359  "MmaTensorOpMultiplicandIterator for ColumnMajor Crosswise may "
1360  "only be instantiated for B operand to warp-level Mma.");
1361 
1363  using Element = Element_;
1364 
1366  static int const kCrosswise = Crosswise;
1367 
1371 
1373  using InstructionShape = InstructionShape_;
1374 
1377  static int const kOpDelta = OpDelta_;
1378 
1380  static int const kThreads = 32;
1381 
1384 
1386  using Index = typename TensorRef::Index;
1387 
1390 
1393 
1396  layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
1398  kCrosswise>,
1399  layout::PitchLinearShape<InstructionShape::kRow,
1400  InstructionShape::kColumn>,
1401  kOpDelta, kThreads, PartitionsK_>;
1402 
1403  public:
1404  //
1405  // Derived quantities
1406  //
1407 
1409  using Fragment = Array<Element, Shape::kCount / kThreads>;
1410 
1411  private:
1413  Base iterator_;
1414 
1415  public:
1419 
1423  : iterator_({ref.data(), ref.stride()}, lane_id) {}
1424 
1428  iterator_.add_pointer_offset(offset);
1429 
1430  return *this;
1431  }
1432 
1437  TensorCoord const &tile_offset) {
1438  iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
1439 
1440  return *this;
1441  }
1442 
1446  ++iterator_;
1447 
1448  return *this;
1449  }
1450 
1454  --iterator_;
1455 
1456  return *this;
1457  }
1458 
1461  CUTLASS_DEVICE
1463  TensorCoord const &tile_offset) {
1464  add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
1465  return *this;
1466  }
1467 
1470  CUTLASS_DEVICE
1472  TensorCoord const &tile_offset) {
1473  add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
1474  return *this;
1475  }
1476 
1479  void load(Fragment &frag) const { iterator_.load(frag); }
1480 
1482  CUTLASS_DEVICE
1485  Fragment &frag,
1487  Index pointer_offset) const {
1488  iterator_.load_with_pointer_offset(frag, pointer_offset);
1489  }
1490 
1492  CUTLASS_DEVICE
1495  Fragment &frag,
1497  Index byte_offset) const {
1498  iterator_.load_with_byte_offset(frag, byte_offset);
1499  }
1500 
1502  CUTLASS_DEVICE
1503  void load(
1505  Fragment &frag,
1507  TensorCoord const &tile_offset) const {
1508  // TODO
1509  assert(0);
1510  }
1511 
1513  CUTLASS_DEVICE
1514  void load(
1516  Fragment &frag,
1518  TensorCoord const &tile_offset,
1520  Index pointer_offset) const {
1521  // TODO
1522  assert(0);
1523  }
1524 
1526  CUTLASS_DEVICE
1529  Fragment &frag,
1531  TensorCoord const &tile_offset,
1533  Index byte_offset) const {
1534  iterator_.load_with_byte_offset(
1535  frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
1536  }
1537 
1545  CUTLASS_DEVICE
1546  void set_kgroup_index(int k_group) {
1547  iterator_.set_kgroup_index(k_group);
1548  }
1549 };
1550 
1552 
1560 template <
1562  typename Shape_,
1564  Operand Operand_,
1566  typename Element_,
1568  typename InstructionShape_,
1571  int OpDelta_,
1573  int Crosswise,
1575  int PartitionsK_>
1576 class MmaTensorOpMultiplicandTileIterator<
1577  Shape_, Operand_, Element_,
1578  cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
1579  sizeof_bits<Element_>::value, Crosswise>,
1580  InstructionShape_, OpDelta_, 32, PartitionsK_> {
1581  public:
1583  using Shape = Shape_;
1584 
1586  static Operand const kOperand = Operand_;
1587 
1588  static_assert(kOperand == Operand::kA,
1589  "MmaTensorOpMultiplicandIterator for RowMajor Crosswise may "
1590  "only be instantiated for A operand to warp-level Mma.");
1591 
1593  using Element = Element_;
1594 
1596  static int const kCrosswise = Crosswise;
1597 
1601 
1603  using InstructionShape = InstructionShape_;
1604 
1607  static int const kOpDelta = OpDelta_;
1608 
1610  static int const kThreads = 32;
1611 
1614 
1616  using Index = typename TensorRef::Index;
1617 
1620 
1623 
1626  layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
1628  kCrosswise>,
1629  layout::PitchLinearShape<InstructionShape::kColumn,
1630  InstructionShape::kRow>,
1631  kOpDelta, kThreads, PartitionsK_>;
1632 
1633  public:
1634  //
1635  // Derived quantities
1636  //
1637 
1639  using Fragment = Array<Element, Shape::kCount / kThreads>;
1640 
1641  private:
1643  Base iterator_;
1644 
1645  public:
1649 
1653  : iterator_({ref.data(), ref.stride()}, lane_id) {}
1654 
1658  iterator_.add_pointer_offset(offset);
1659 
1660  return *this;
1661  }
1662 
1667  TensorCoord const &tile_offset) {
1668  iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
1669 
1670  return *this;
1671  }
1672 
1676  ++iterator_;
1677 
1678  return *this;
1679  }
1680 
1684  --iterator_;
1685 
1686  return *this;
1687  }
1688 
1691  CUTLASS_DEVICE
1693  TensorCoord const &tile_offset) {
1694  add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
1695  return *this;
1696  }
1697 
1700  CUTLASS_DEVICE
1702  TensorCoord const &tile_offset) {
1703  add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
1704  return *this;
1705  }
1706 
1709  void load(Fragment &frag) const { iterator_.load(frag); }
1710 
1712  CUTLASS_DEVICE
1715  Fragment &frag,
1717  Index pointer_offset) const {
1718  iterator_.load_with_pointer_offset(frag, pointer_offset);
1719  }
1720 
1722  CUTLASS_DEVICE
1725  Fragment &frag,
1727  Index byte_offset) const {
1728  iterator_.load_with_byte_offset(frag, byte_offset);
1729  }
1730 
1732  CUTLASS_DEVICE
1733  void load(
1735  Fragment &frag,
1737  TensorCoord const &tile_offset) const {
1738  // TODO
1739  assert(0);
1740  }
1741 
1743  CUTLASS_DEVICE
1744  void load(
1746  Fragment &frag,
1748  TensorCoord const &tile_offset,
1750  Index pointer_offset) const {
1751  // TODO
1752  assert(0);
1753  }
1754 
1756  CUTLASS_DEVICE
1759  Fragment &frag,
1761  TensorCoord const &tile_offset,
1763  Index byte_offset) const {
1764  iterator_.load_with_byte_offset(
1765  frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
1766  }
1767 
1775  CUTLASS_DEVICE
1776  void set_kgroup_index(int k_group) {
1777  iterator_.set_kgroup_index(k_group);
1778  }
1779 };
1780 
1782 template <
1784  typename Shape_,
1786  typename Element_,
1788  typename Layout_,
1790  typename InstructionShape_,
1793  typename OpDelta_>
1795 class MmaTensorOpAccumulatorTileIterator;
1797 
1806 template <
1808  typename Shape_,
1810  typename Element_,
1812  typename InstructionShape_,
1815  typename OpDelta_>
1816 class MmaTensorOpAccumulatorTileIterator<
1817  Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
1818  public:
1819 
1821  using Shape = Shape_;
1822 
1824  static Operand const kOperand = Operand::kC;
1825 
1827  using Element = Element_;
1828 
1831 
1833  using InstructionShape = InstructionShape_;
1834 
1836  using OpDelta = OpDelta_;
1837 
1839  static int const kThreads = 32;
1840 
1843 
1845  using Index = typename TensorRef::Index;
1846 
1849 
1852 
1854  struct Policy {
1855  static_assert(
1856  !(Shape::kRow % InstructionShape::kM) &&
1857  !(Shape::kColumn % InstructionShape::kN),
1858  "Shape of warp-level Mma must be divisible by operator shape.");
1859 
1861  "Layouts must be defined for logical MatrixCoord coordinate space.");
1862 
1864  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
1865  Shape::kColumn / InstructionShape::kN>;
1866  };
1867 
1868 private:
1869 
1870  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
1871  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
1872  // of that row. The accumulators within one row are assumed to be consecutive.
1873  static int const kElementsPerAccess = InstructionShape::kN / 4;
1874  static int const kRowsPerTile = 8;
1875  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
1876 
1877 public:
1878 
1879  //
1880  // Derived quantities
1881  //
1882 
1884  using Fragment = Array<Element, Shape::kCount / kThreads>;
1885 
1886 private:
1887 
1889  TensorRef ref_;
1890 
1891 public:
1892 
1896 
1900  TensorRef const &ref,
1901  int lane_id
1902  ):
1903  ref_(ref) {
1904 
1905  int quad = (lane_id >> 2);
1906  int lane_in_quad = (lane_id & 3);
1907 
1908  MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
1909 
1910  ref_.add_coord_offset(lane_offset);
1911  }
1912 
1916  ref_.add_pointer_offset(offset);
1917  return *this;
1918  }
1919 
1923 
1924  ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
1925 
1926  return *this;
1927  }
1928 
1932  // deliberate no-op
1933  return *this;
1934  }
1935 
1939  // deliberate no-op
1940  return *this;
1941  }
1942 
1944  CUTLASS_DEVICE
1946  add_tile_offset(tile_offset);
1947  return *this;
1948  }
1949 
1951  CUTLASS_DEVICE
1953  add_tile_offset(-tile_offset);
1954  return *this;
1955  }
1956 
1959  void load(Fragment &frag) const {
1960  load_with_pointer_offset(frag, 0);
1961  }
1962 
1964  CUTLASS_DEVICE
1966  Fragment &frag,
1967  Index pointer_offset) const {
1968 
1969  TensorRef offset_ref(ref_);
1970  offset_ref.add_pointer_offset(pointer_offset);
1971 
1973  for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
1975  for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
1976 
1977  int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
1978  (mma_n * Policy::MmaIterations::kRow + mma_m);
1979 
1981  for (int row = 0; row < kAccumulatorRows; ++row) {
1983  for (int col = 0; col < kElementsPerAccess; ++col) {
1984  int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
1985  row * kRowsPerTile;
1986  int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
1987 
1988  frag[mma_accum_start + row * kElementsPerAccess + col] = offset_ref.at({accum_m, accum_n});
1989  }
1990  }
1991  }
1992  }
1993  }
1994 
1996  CUTLASS_DEVICE
1998  Fragment &frag,
1999  Index byte_offset) const {
2000 
2001  load_with_pointer_offset(frag, byte_offset / sizeof(Element));
2002  }
2003 
2005  CUTLASS_DEVICE
2006  void load(
2007  Fragment &frag,
2008  TensorCoord const &tile_offset) const {
2009 
2010  load(frag, tile_offset, 0);
2011  }
2012 
2014  CUTLASS_DEVICE
2015  void load(
2016  Fragment &frag,
2017  TensorCoord const &tile_offset,
2018  Index pointer_offset) const {
2019 
2020  load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
2021  }
2022 
2025  void store(Fragment const &frag) const {
2026  store_with_pointer_offset(frag, 0);
2027  }
2028 
2030  CUTLASS_DEVICE
2032  Fragment const &frag,
2033  Index pointer_offset) const {
2034 
2035  TensorRef offset_ref(ref_);
2036  offset_ref.add_pointer_offset(pointer_offset);
2037 
2039  for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
2041  for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
2042 
2043  int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
2044  (mma_n * Policy::MmaIterations::kRow + mma_m);
2045 
2047  for (int row = 0; row < kAccumulatorRows; ++row) {
2049  for (int col = 0; col < kElementsPerAccess; ++col) {
2050  int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
2051  row * kRowsPerTile;
2052  int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
2053  int idx = mma_accum_start + row * kElementsPerAccess + col;
2054 
2055  offset_ref.at({accum_m, accum_n}) = frag[idx];
2056  }
2057  }
2058  }
2059  }
2060  }
2061 
2063  CUTLASS_DEVICE
2065  Fragment const &frag,
2066  Index byte_offset) const {
2067 
2068  store_with_pointer_offset(frag, byte_offset / sizeof(Element));
2069  }
2070 
2072  CUTLASS_DEVICE
2073  void store(
2074  Fragment &frag,
2075  TensorCoord const &tile_offset) const {
2076 
2077  store(frag, tile_offset, 0);
2078  }
2079 
2081  CUTLASS_DEVICE
2082  void store(
2084  Fragment const &frag,
2086  TensorCoord const &tile_offset,
2088  Index pointer_offset) const {
2089  store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
2090  }
2091 };
2092 
2094 
2103 template <
2105  typename Shape_,
2107  typename Element_,
2109  typename InstructionShape_,
2112  typename OpDelta_>
2113 class MmaTensorOpAccumulatorTileIterator<Shape_, Element_,
2114  cutlass::layout::ColumnMajor,
2115  InstructionShape_, OpDelta_> {
2116  public:
2117 
2119  using Shape = Shape_;
2120 
2122  static Operand const kOperand = Operand::kC;
2123 
2125  using Element = Element_;
2126 
2129 
2131  using InstructionShape = InstructionShape_;
2132 
2134  using OpDelta = OpDelta_;
2135 
2137  static int const kThreads = 32;
2138 
2141 
2143  using Index = typename TensorRef::Index;
2144 
2147 
2150 
2152  struct Policy {
2153  static_assert(
2154  !(Shape::kRow % InstructionShape::kM) &&
2155  !(Shape::kColumn % InstructionShape::kN),
2156  "Shape of warp-level Mma must be divisible by operator shape.");
2157 
2159  "Layouts must be defined for logical MatrixCoord coordinate space.");
2160 
2162  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
2163  Shape::kColumn / InstructionShape::kN>;
2164  };
2165 
2166 private:
2167 
2168  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
2169  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
2170  // of that row. The accumulators within one row are assumed to be consecutive.
2171  static int const kElementsPerAccess = InstructionShape::kN / 4;
2172  static int const kRowsPerTile = 8;
2173  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
2174 
2175 public:
2176 
2177  //
2178  // Derived quantities
2179  //
2180 
2182  using Fragment = Array<Element, Shape::kCount / kThreads>;
2183 
2184 private:
2185 
2187  TensorRef ref_;
2188 
2189 public:
2190 
2194 
2198  TensorRef const &ref,
2199  int lane_id
2200  ):
2201  ref_(ref) {
2202 
2203  int quad = (lane_id >> 2);
2204  int lane_in_quad = (lane_id & 3);
2205 
2206  MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
2207 
2208  ref_.add_coord_offset(lane_offset);
2209  }
2210 
2214  ref_.add_pointer_offset(offset);
2215  return *this;
2216  }
2217 
2221 
2222  ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
2223 
2224  return *this;
2225  }
2226 
2230  // deliberate no-op
2231  return *this;
2232  }
2233 
2237  // deliberate no-op
2238  return *this;
2239  }
2240 
2242  CUTLASS_DEVICE
2244  add_tile_offset(tile_offset);
2245  return *this;
2246  }
2247 
2249  CUTLASS_DEVICE
2251  add_tile_offset(-tile_offset);
2252  return *this;
2253  }
2254 
2257  void load(Fragment &frag) const {
2258  load_with_pointer_offset(frag, 0);
2259  }
2260 
2262  CUTLASS_DEVICE
2264  Fragment &frag,
2265  Index pointer_offset) const {
2266 
2267  TensorRef offset_ref(ref_);
2268  offset_ref.add_pointer_offset(pointer_offset);
2269 
2271  for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
2273  for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
2274 
2275  int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
2276  (mma_n * Policy::MmaIterations::kRow + mma_m);
2277 
2279  for (int row = 0; row < kAccumulatorRows; ++row) {
2281  for (int col = 0; col < kElementsPerAccess; ++col) {
2282  int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
2283  row * kRowsPerTile;
2284  int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
2285  int idx = mma_accum_start + row * kElementsPerAccess + col;
2286 
2287  frag[idx] = offset_ref.at({accum_m, accum_n});
2288  }
2289  }
2290  }
2291  }
2292  }
2293 
2295  CUTLASS_DEVICE
2297  Fragment &frag,
2298  Index byte_offset) const {
2299 
2300  load_with_pointer_offset(frag, byte_offset / sizeof(Element));
2301  }
2302 
2304  CUTLASS_DEVICE
2305  void load(
2306  Fragment &frag,
2307  TensorCoord const &tile_offset) const {
2308 
2309  load(frag, tile_offset, 0);
2310  }
2311 
2313  CUTLASS_DEVICE
2314  void load(
2315  Fragment &frag,
2316  TensorCoord const &tile_offset,
2317  Index pointer_offset) const {
2318 
2319  load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
2320  }
2321 
2324  void store(Fragment const &frag) const {
2325  store_with_pointer_offset(frag, 0);
2326  }
2327 
2329  CUTLASS_DEVICE
2331  Fragment const &frag,
2332  Index pointer_offset) const {
2333 
2334  TensorRef offset_ref(ref_);
2335  offset_ref.add_pointer_offset(pointer_offset);
2336 
2338  for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
2340  for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
2341 
2342  int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
2343  (mma_n * Policy::MmaIterations::kRow + mma_m);
2344 
2346  for (int row = 0; row < kAccumulatorRows; ++row) {
2348  for (int col = 0; col < kElementsPerAccess; ++col) {
2349  int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
2350  row * kRowsPerTile;
2351  int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
2352  int idx = mma_accum_start + row * kElementsPerAccess + col;
2353 
2354  offset_ref.at({accum_m, accum_n}) = frag[idx];
2355  }
2356  }
2357  }
2358  }
2359  }
2360 
2362  CUTLASS_DEVICE
2364  Fragment const &frag,
2365  Index byte_offset) const {
2366 
2367  store_with_pointer_offset(frag, byte_offset / sizeof(Element));
2368  }
2369 
2371  CUTLASS_DEVICE
2372  void store(
2373  Fragment &frag,
2374  TensorCoord const &tile_offset) const {
2375 
2376  store(frag, tile_offset, 0);
2377  }
2378 
2380  CUTLASS_DEVICE
2381  void store(
2383  Fragment const &frag,
2385  TensorCoord const &tile_offset,
2387  Index pointer_offset) const {
2388  store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
2389  }
2390 };
2391 
2393 
2402 
2403 template <
2405  typename Shape_,
2407  typename Element_,
2409  typename InstructionShape_,
2412  typename OpDelta_,
2414  int InterleavedN>
2415 class MmaTensorOpAccumulatorTileIterator<
2416  Shape_, Element_, cutlass::layout::ColumnMajorInterleaved<InterleavedN>,
2417  InstructionShape_, OpDelta_> {
2418  public:
2419 
2421  using Shape = Shape_;
2422 
2424  static Operand const kOperand = Operand::kC;
2425 
2427  using Element = Element_;
2428 
2431 
2433  using InstructionShape = InstructionShape_;
2434 
2436  using OpDelta = OpDelta_;
2437 
2439  static int const kThreads = 32;
2440 
2443 
2445  using Index = typename TensorRef::Index;
2446 
2449 
2452 
2454  struct Policy {
2455  static_assert(
2456  !(Shape::kRow % InstructionShape::kM) &&
2457  !(Shape::kColumn % InstructionShape::kN),
2458  "Shape of warp-level Mma must be divisible by operator shape.");
2459 
2461  "Layouts must be defined for logical MatrixCoord coordinate space.");
2462 
2464  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
2465  Shape::kColumn / InstructionShape::kN>;
2466  };
2467 
2468 private:
2469 
2470  static int const kElementsPerAccess = 2;
2471 
2472 public:
2473 
2474  //
2475  // Derived quantities
2476  //
2477 
2478  using AccessType = Array<Element, kElementsPerAccess>;
2479 
2481  using Fragment = Array<Element, Shape::kCount / kThreads>;
2482 
2483 private:
2484 
2486  TensorRef ref_;
2487 
2488 public:
2489 
2493 
2497  TensorRef const &ref,
2498  int lane_id
2499  ):
2500  ref_(ref) {
2501 
2502  int quad = (lane_id >> 2);
2503  int lane_in_quad = (lane_id & 3);
2504 
2505  MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
2506 
2507  ref_.add_coord_offset(lane_offset);
2508  }
2509 
2513  ref_.add_pointer_offset(offset);
2514  return *this;
2515  }
2516 
2520 
2521  ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
2522 
2523  return *this;
2524  }
2525 
2529  // deliberate no-op
2530  return *this;
2531  }
2532 
2536  // deliberate no-op
2537  return *this;
2538  }
2539 
2541  CUTLASS_DEVICE
2543  add_tile_offset(tile_offset);
2544  return *this;
2545  }
2546 
2548  CUTLASS_DEVICE
2550  add_tile_offset(-tile_offset);
2551  return *this;
2552  }
2553 
2556  void load(Fragment &frag) const {
2557  load_with_pointer_offset(frag, 0);
2558  }
2559 
2561  CUTLASS_DEVICE
2563  Fragment &frag,
2564  Index pointer_offset) const {
2565 
2566  TensorRef offset_ref(ref_);
2567  offset_ref.add_pointer_offset(pointer_offset);
2568 
2569  AccessType* frag_ptr = reinterpret_cast<AccessType *>(&frag);
2570 
2572  for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
2574  for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
2575  int accum_m = mma_m * InstructionShape::kM;
2576  int accum_n = mma_n * InstructionShape::kN;
2577 
2578  int idx = mma_m + mma_n * Policy::MmaIterations::kRow;
2579 
2580  AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
2581  offset_ref.offset(TensorCoord(accum_m, accum_n)));
2582 
2583  frag_ptr[idx] = access_ptr[0];
2584  }
2585  }
2586  }
2587 
2589  CUTLASS_DEVICE
2591  Fragment &frag,
2592  Index byte_offset) const {
2593 
2594  load_with_pointer_offset(frag, byte_offset / sizeof(Element));
2595  }
2596 
2598  CUTLASS_DEVICE
2599  void load(
2600  Fragment &frag,
2601  TensorCoord const &tile_offset) const {
2602 
2603  load(frag, tile_offset, 0);
2604  }
2605 
2607  CUTLASS_DEVICE
2608  void load(
2609  Fragment &frag,
2610  TensorCoord const &tile_offset,
2611  Index pointer_offset) const {
2612 
2613  load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
2614  }
2615 
2618  void store(Fragment const &frag) const {
2619  store_with_pointer_offset(frag, 0);
2620  }
2621 
2623  CUTLASS_DEVICE
2625  Fragment const &frag,
2626  Index pointer_offset) const {
2627 
2628  TensorRef offset_ref(ref_);
2629  offset_ref.add_pointer_offset(pointer_offset);
2630 
2631  AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);
2632 
2634  for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
2636  for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
2637  int accum_m = mma_m * InstructionShape::kM;
2638  int accum_n = mma_n * InstructionShape::kN;
2639 
2640  int idx = mma_m + mma_n * Policy::MmaIterations::kRow;
2641 
2642  AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
2643  offset_ref.offset(TensorCoord(accum_m, accum_n)));
2644 
2645  access_ptr[0] = frag_ptr[idx];
2646  }
2647  }
2648  }
2649 
2651  CUTLASS_DEVICE
2653  Fragment const &frag,
2654  Index byte_offset) const {
2655 
2656  store_with_pointer_offset(frag, byte_offset / sizeof(Element));
2657  }
2658 
2660  CUTLASS_DEVICE
2661  void store(
2662  Fragment &frag,
2663  TensorCoord const &tile_offset) const {
2664 
2665  store(frag, tile_offset, 0);
2666  }
2667 
2669  CUTLASS_DEVICE
2670  void store(
2672  Fragment const &frag,
2674  TensorCoord const &tile_offset,
2676  Index pointer_offset) const {
2677  store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
2678  }
2679 };
2680 
2682 } // namespace warp
2683 } // namespace gemm
2684 } // namespace cutlass
2685 
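The listing above defines the warp-level tile iterators only; a warp-level MMA (for example cutlass::gemm::warp::MmaTensorOp) composes them. The following is a minimal sketch, under assumed shapes and layouts that are not prescribed by this header, of how the A-operand and accumulator iterators might be instantiated for a half-precision Tensor Core MMA:

// Illustrative sketch only: the warp shape, instruction shape, element types, and
// congruous layout below are assumptions chosen for a half-precision Tensor Core MMA;
// consult the warp-level Mma definitions for the canonical composition.
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"

using WarpShape        = cutlass::gemm::GemmShape<64, 64, 32>;  // assumed warp tile (M, N, K)
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;    // assumed mma.sync shape

// Congruous shared-memory layout for a 16-bit A operand (128 bytes / 2 bytes = 64 elements).
using SmemLayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
    cutlass::sizeof_bits<cutlass::half_t>::value, 64>;

// A-operand iterator: loads the warp's A tile from shared memory with LDSM.
using IteratorA = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
    cutlass::MatrixShape<WarpShape::kM, WarpShape::kK>,               // tile shape
    cutlass::gemm::Operand::kA,                                       // operand
    cutlass::half_t,                                                  // element
    SmemLayoutA,                                                      // shared-memory layout
    cutlass::MatrixShape<InstructionShape::kM, InstructionShape::kK>, // per-instruction shape
    1,                                                                // OpDelta
    32>;                                                              // threads per warp

// Accumulator iterator: reads/writes the warp's C tile in a row-major tensor.
using IteratorC = cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
    cutlass::MatrixShape<WarpShape::kM, WarpShape::kN>,               // accumulator tile
    float,                                                            // element
    cutlass::layout::RowMajor,                                        // layout
    InstructionShape,                                                 // instruction shape
    cutlass::MatrixShape<1, 1>>;                                      // OpDelta

// In device code each iterator is constructed from a TensorRef and the lane id, e.g.
//   IteratorA iter_A(ref_A, lane_id);
//   iter_A.load(frag_A);
//   ++iter_A;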
OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator.h:1836
Shape_ Shape
Shape of tile to load (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:2119
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:1139
Describes the size of a matrix tile.
Definition: matrix_shape.h:42
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:1915
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:2433
Definition: aligned_buffer.h:35
Defines a structure containing strides, bounds, and a pointer to tensor data.
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1514
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1259
Definition: tensor_op_multiplicand_sm75.h:734
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:257
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:2496
CUTLASS_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const
Stores a fragment to memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2073
CUTLASS_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const
Stores a fragment to memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2661
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:2236
CUTLASS_HOST_DEVICE Element * data() const
Returns the pointer to referenced data.
Definition: tensor_ref.h:254
std::is_same (false specialization)
Definition: platform.h:394
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1163
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:773
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:2296
CUTLASS_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Stores a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2381
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:321
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1952
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:408
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate.
Definition: coord.h:387
Operand
GEMM operand enumeration: D = A * B + C.
Definition: include/cutlass/gemm/gemm.h:39
CUTLASS_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Stores a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2082
Definition: tensor_op_multiplicand_sm75.h:422
Architecture-specific operators on memory added for SM75.
Definition: tensor_op_multiplicand_sm75.h:835
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1279
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:872
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:1224
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:293
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:840
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:1965
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1922
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:1945
Defines common types used for all GEMM-like operators.
Definition: tensor_op_multiplicand_sm75.h:213
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:1833
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1683
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:2562
CUTLASS_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:2031
C++ features that may be otherwise unimplemented for CUDA device functions.
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:2146
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional logical offset.
Definition: mma_tensor_op_tile_iterator.h:2263
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:809
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:2243
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:1895
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile.
Definition: mma_tensor_op_tile_iterator.h:1884
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:860
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:599
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:609
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1757
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:2229
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the opposite of the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:312
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2519
CUTLASS_HOST_DEVICE TensorRef & add_coord_offset(TensorCoord const &coord)
Adds an offset to each pointer.
Definition: tensor_ref.h:326
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:641
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:266
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:2556
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:1723
Mapping function for column-major matrices.
Definition: layout/matrix.h:142
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1938
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:2197
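A minimal sketch of constructing the accumulator iterator from a TensorRef and lane id and writing a warp's results back. IteratorC is an assumed instantiation of MmaTensorOpAccumulatorTileIterator and store_accumulators_sketch is a hypothetical helper.

#include "cutlass/cutlass.h"

template <typename IteratorC>
CUTLASS_DEVICE void store_accumulators_sketch(
    typename IteratorC::TensorRef ref_C,           // destination tensor (e.g. a shared-memory tile)
    typename IteratorC::Fragment const &accum,     // Array<Element, Shape::kCount / kThreads>
    int lane_id) {

  IteratorC iter_C(ref_C, lane_id);                // constructor maps lane_id to this lane's elements
  iter_C.store(accum);                             // write the fragment at the iterator's location

  // store_with_pointer_offset() / store_with_byte_offset() accept an extra displacement
  // expressed in elements or in bytes, respectively.
}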
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:830
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:2528
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:782
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
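Since a Fragment is a statically sized cutlass::Array holding Shape::kCount / kThreads elements per thread, it can be initialized or transformed with a fully unrolled loop via CUTLASS_PRAGMA_UNROLL. A small sketch, assuming Iterator is one of the tile iterators in this header and clear_fragment_sketch is a hypothetical helper; kElements is the element count exposed by cutlass::Array.

#include "cutlass/cutlass.h"
#include "cutlass/array.h"

template <typename Iterator>
CUTLASS_DEVICE void clear_fragment_sketch(typename Iterator::Fragment &frag) {
  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < int(Iterator::Fragment::kElements); ++i) {
    frag[i] = typename Iterator::Element(0);       // zero each of this thread's elements
  }
}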
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1503
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:1483
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2220
Defines layout functions used by TensorRef and derived classes for common 4-D and 5-D tensor formats...
OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator.h:2436
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:1851
CUTLASS_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const
Stores a fragment to memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:2652
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:2193
CUTLASS_HOST_DEVICE Stride stride() const
Returns the layout object&#39;s stride vector.
Definition: tensor_ref.h:277
CUTLASS_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const
Stores a fragment to memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2372
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:1427
typename Layout::TensorCoord TensorCoord
Coordinate in logical tensor space.
Definition: tensor_ref.h:171
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const
Stores a fragment to memory.
Definition: mma_tensor_op_tile_iterator.h:2025
Defines a Shape template for matrix tiles.
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:386
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1527
Defines the size of an element in bits.
Definition: numeric_types.h:42
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:342
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1291
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:1997
Definition: mma_tensor_op_tile_iterator.h:1794
#define nullptr
nullptr
Definition: platform.h:144
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:1657
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:1228
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Retreats in units of whole tiles along the logical coordinate space of the tensor.
Definition: mma_tensor_op_tile_iterator.h:2549
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const
Stores a fragment to memory.
Definition: mma_tensor_op_tile_iterator.h:2618
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:823
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1675
CUTLASS_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const
Stores a fragment to memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:2064
OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator.h:2134
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
Top-level include for all CUTLASS numeric types.
CUTLASS_HOST_DEVICE LongIndex offset(TensorCoord const &coord) const
Computes the offset of an index from the origin of the tensor.
Definition: tensor_ref.h:301
Definition: mma_tensor_op_tile_iterator.h:75
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:2512
#define static_assert(__e, __m)
Definition: platform.h:153
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2599
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:1493
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:1959
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1931
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2608
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1445
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1453
CUTLASS_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const
Stores a fragment to memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:2363
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2305
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:542
typename TensorRef::LongIndex LongIndex
Long Index type.
Definition: mma_tensor_op_tile_iterator.h:1848
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:1709
typename Layout::Index Index
Index type.
Definition: tensor_ref.h:165
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1744
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2015
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:2131
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1269
Shape_ Shape
Shape of tile to load (concept: MatrixShape)
Definition: mma_tensor_op_tile_iterator.h:1821
CUTLASS_HOST_DEVICE Reference at(TensorCoord const &coord) const
Returns a reference to the element at a given Coord.
Definition: tensor_ref.h:307
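Each iterator is constructed over a TensorRef, which binds a pointer to a layout object and provides the element-level addressing shown in these entries (stride(), offset(), at(), add_pointer_offset()). A small standalone sketch using an assumed row-major half-precision tensor; tensor_ref_sketch is a hypothetical helper.

#include <vector>
#include "cutlass/numeric_types.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/matrix_coord.h"

void tensor_ref_sketch() {
  int const kRows = 8;
  int const kColumns = 16;
  std::vector<cutlass::half_t> data(kRows * kColumns);

  // RowMajor is constructed from the leading dimension (stride between rows).
  cutlass::TensorRef<cutlass::half_t, cutlass::layout::RowMajor> ref(
      data.data(), cutlass::layout::RowMajor(kColumns));

  cutlass::MatrixCoord coord(2, 3);

  ref.at(coord) = cutlass::half_t(1.0f);  // reference to the element at (row 2, column 3)
  auto linear = ref.offset(coord);        // linear offset of that element from the origin

  ref.add_pointer_offset(linear);         // advance the base pointer by 'linear' elements
}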
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const
Stores a fragment to memory.
Definition: mma_tensor_op_tile_iterator.h:2324
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:376
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:2257
Defines layout functions used by TensorRef and derived classes.
CUTLASS_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Stores a fragment to memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2670
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:396
Math utilities.
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Advances in units of whole tiles along the logical coordinate space of the tensor.
Definition: mma_tensor_op_tile_iterator.h:578
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
Definition: layout/matrix.h:343
CUTLASS_HOST_DEVICE TensorRef & add_pointer_offset(LongIndex offset_)
Adds an offset to each pointer.
Definition: tensor_ref.h:319
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:1733
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:335
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Loads a fragment from memory at the location pointed to by the iterator.
Definition: mma_tensor_op_tile_iterator.h:1479
CUTLASS_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:2330
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Loads a fragment from memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:1713
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const
Loads a fragment from memory with additional byte offset.
Definition: mma_tensor_op_tile_iterator.h:2590
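The *_with_pointer_offset() and *_with_byte_offset() overloads differ only in the unit of the extra displacement: pointer offsets count elements, byte offsets count raw bytes. A sketch of the equivalence, assuming Iterator is one of the tile iterators above and offset_flavors_sketch is a hypothetical helper.

#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"

template <typename Iterator>
CUTLASS_DEVICE void offset_flavors_sketch(typename Iterator::TensorRef ref, int lane_id) {
  Iterator iter(ref, lane_id);
  typename Iterator::Fragment frag;

  // Displace the access by 128 elements beyond the iterator's current position.
  iter.load_with_pointer_offset(frag, 128);

  // The same displacement expressed in bytes; sizeof_bits<> covers sub-byte element types.
  iter.load_with_byte_offset(
      frag, 128 * cutlass::sizeof_bits<typename Iterator::Element>::value / 8);
}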
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef.
Definition: mma_tensor_op_tile_iterator.h:1899
Definition: tensor_op_multiplicand_sm75.h:632
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:629
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Retreats in units of whole tiles along the logical coordinate space of the tensor.
Definition: mma_tensor_op_tile_iterator.h:2250
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:850
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:1202
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:619
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:2149
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile.
Definition: mma_tensor_op_tile_iterator.h:2182
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2314
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension.
Definition: mma_tensor_op_tile_iterator.h:2535
Basic include for CUTLASS.
Definition: matrix_coord.h:39
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator.
Definition: mma_tensor_op_tile_iterator.h:2492
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
Advances in units of whole tiles along the logical coordinate space of the tensor.
Definition: mma_tensor_op_tile_iterator.h:2542
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const
Loads a fragment from memory with logical offset in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:2006
CUTLASS_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const
Stores a fragment to memory with additional pointer offset.
Definition: mma_tensor_op_tile_iterator.h:2624
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor.
Definition: mma_tensor_op_tile_iterator.h:2451
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile.
Definition: mma_tensor_op_tile_iterator.h:2481
typename Layout::LongIndex LongIndex
Long index used for pointer offsets.
Definition: tensor_ref.h:168
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles.
Definition: mma_tensor_op_tile_iterator.h:551
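add_tile_offset() and operator+= / operator-= displace the iterator by whole Shape-sized tiles in the tensor's logical coordinate space, while operator++ / operator-- step a single tile along the advance dimension. A sketch of the equivalent forms, assuming Iterator is one of the tile iterators above and tile_advance_sketch is a hypothetical helper.

#include "cutlass/cutlass.h"

template <typename Iterator>
CUTLASS_DEVICE void tile_advance_sketch(Iterator &iter) {
  // Jump two whole tiles along the second logical dimension.
  iter.add_tile_offset(typename Iterator::TensorCoord(0, 2));

  // operator+= is shorthand for add_tile_offset(); operator-= moves in the opposite direction.
  iter += typename Iterator::TensorCoord(0, 2);
  iter -= typename Iterator::TensorCoord(0, 2);

  // Step forward and back by one tile along the advance dimension.
  ++iter;
  --iter;
}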
Definition: tensor_op_multiplicand_sm75.h:527
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory.
Definition: mma_tensor_op_tile_iterator.h:2213