CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
regular_tile_iterator_tensor_op_sm70.h
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
35 #pragma once
36 
37 #include "cutlass/cutlass.h"
38 #include "cutlass/array.h"
39 #include "cutlass/matrix_coord.h"
40 #include "cutlass/tensor_ref.h"
41 #include "cutlass/layout/pitch_linear.h"
42 #include "cutlass/layout/tensor_op_multiplicand_sm70.h"
43 
44 #include "cutlass/transform/threadblock/regular_tile_iterator.h"
45 
46 ////////////////////////////////////////////////////////////////////////////////////////////////////
47 
48 namespace cutlass {
49 namespace transform {
50 namespace threadblock {
51 
53 
61 template <
62  typename Shape_,
63  typename Element_,
64  int AdvanceRank,
65  typename ThreadMap_,
66  int Alignment
67 >
68 class RegularTileIterator<
69  Shape_,
70  Element_,
71  layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
72  AdvanceRank,
73  ThreadMap_,
74  Alignment> {
75 public:
76 
77  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
78  "Specialization for pitch-linear iterator may advance along the "
79  "contiguous(rank=0) or strided(rank=1) dimension.");
80 
81  using Shape = Shape_;
82  using Element = Element_;
83  using Layout = layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
84  static int const kAdvanceRank = AdvanceRank;
85 
86  using Index = typename Layout::Index;
87  using LongIndex = typename Layout::LongIndex;
88 
89  using TensorRef = TensorRef<Element, Layout>;
90  using TensorCoord = typename Layout::TensorCoord;
91 
92  using ThreadMap = ThreadMap_;
93 
95  struct Detail {
96 
98  static int const kAccessSizeInBits = 128;
99 
100  static_assert(
101  sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
102  "This iterator requires a policy whose access size is 128 bits.");
103 
105  static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
106  };
107 
108 
109 private:
110 
112  using AccessType = Array<Element, Layout::kElementsPerAccess>;
113 
114 public:
115 
117  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
118 
119 private:
120 
121  //
122  // Data members
123  //
124 
126  Index stride_;
127 
129  AccessType * pointer_[Detail::kPointerCount];
130 
132  Index byte_offset_;
133 
134 public:
135 
136  /// Construct a TileIterator with zero threadblock offset
137  CUTLASS_HOST_DEVICE
138  RegularTileIterator(
139  TensorRef ref,
140  int thread_id
141  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
142 
143  layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
144 
146  for (int i = 0; i < Detail::kPointerCount; ++i) {
147 
148  // This is the offset of a thread within a threadblock tile for a specific pointer
149  // (units of elements)
150  layout::PitchLinearCoord thread_offset_in_threadblock_tile =
151  thread_offset_base + layout::PitchLinearCoord{0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
152 
153  // initialize pointer
154  pointer_[i] = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
155  }
156  }
157 
160  void add_pointer_offset(LongIndex pointer_offset) {
161 
162  byte_offset_ += pointer_offset * sizeof(Element);
163  }
164 
165  /// Advances to the next tile in memory
166  CUTLASS_HOST_DEVICE
167  RegularTileIterator &operator++() {
168 
169  add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
170 
171  return *this;
172  }
173 
174  /// Advances to the next tile in memory
175  CUTLASS_HOST_DEVICE
176  RegularTileIterator operator++(int) {
177 
178  RegularTileIterator prev(*this);
179  this->operator++();
180 
181  return prev;
182  }
183 
185  CUTLASS_DEVICE
186  void add_tile_offset(TensorCoord const &coord) {
187  add_pointer_offset(
188  coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
189  coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess
190  );
191  }
192 
194  CUTLASS_DEVICE
195  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
196 
197  AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
198 
199  Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
200 
202  for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
203 
204  AccessType *access_ptr = pointer_[s & 1];
205  int stride_idx = (s & ~1);
206 
208  for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
209 
210  int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
211  c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
212  vec_pointer_offset;
213 
214  int access_idx = c + s * ThreadMap::Iterations::kContiguous;
215 
216  char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);
217 
218  frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
219  }
220  }
221  }
222 
224  CUTLASS_DEVICE
225  void load(Fragment &frag) {
226  load_with_pointer_offset(frag, 0);
227  }
228 
230  CUTLASS_DEVICE
231  void store_with_pointer_offset(
232  Fragment const &frag,
233  Index pointer_offset) {
234 
235  AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
236 
237  Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
238 
240  for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
241 
242  AccessType *access_ptr = pointer_[s & 1];
243  int stride_idx = (s & ~1);
244 
246  for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
247 
248  int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
249  c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
250  vec_pointer_offset;
251 
252  int access_idx = c + s * ThreadMap::Iterations::kContiguous;
253 
254  char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);
255 
256  *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) = frag_ptr[access_idx];
257  }
258  }
259  }
260 
262  CUTLASS_DEVICE
263  void store(Fragment const &frag) {
264  store_with_pointer_offset(frag, 0);
265  }
266 };
267 
269 
270 // Tile Iterator specialized for column-major congruous TensorOp formats.
277 template <
278  typename Shape_,
279  typename Element_,
280  int AdvanceRank,
281  typename ThreadMap_,
282  int Alignment
283 >
284 class RegularTileIterator<
285  Shape_,
286  Element_,
287  layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
288  AdvanceRank,
289  ThreadMap_,
290  Alignment> {
291 public:
292 
293  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
294  "Specialization for column-major iterator may advance along the "
295  "columns(rank=0) or rows(rank=1) dimension.");
296 
297  using Shape = Shape_;
298  using Element = Element_;
299  using Layout = layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
300  static int const kAdvanceRank = AdvanceRank;
301 
302  using Index = typename Layout::Index;
303  using LongIndex = typename Layout::LongIndex;
304 
305  using TensorRef = TensorRef<Element, Layout>;
306  using TensorCoord = typename Layout::TensorCoord;
307 
308  using ThreadMap = ThreadMap_;
309 
310  /// Underlying iterator type
311  using UnderlyingIterator = RegularTileIterator<
312  layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
313  Element,
314  layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
315  (kAdvanceRank == 0 ? 0 : 1),
316  ThreadMap_>;
317 
318 public:
319 
321  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
322 
323 private:
324 
326  UnderlyingIterator iterator_;
327 
328 public:
329 
330  /// Construct a TileIterator with zero threadblock offset
331  CUTLASS_HOST_DEVICE
332  RegularTileIterator(
333  TensorRef ref,
334  int thread_id
335  ): iterator_({ref.data(), ref.stride()}, thread_id) {
336 
337  }
338 
341  void add_pointer_offset(LongIndex pointer_offset) {
342  iterator_.add_pointer_offset(pointer_offset);
343  }
344 
346  CUTLASS_DEVICE
347  void add_tile_offset(TensorCoord const &coord) {
348  iterator_.add_tile_offset({coord.row(), coord.column()});
349  }
350 
354 
355  ++iterator_;
356  return *this;
357  }
358 
362 
363  RegularTileIterator prev(*this);
364  ++iterator_;
365 
366  return prev;
367  }
368 
370  CUTLASS_DEVICE
371  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
372  iterator_.load_with_pointer_offset(frag, pointer_offset);
373  }
374 
376  CUTLASS_DEVICE
377  void load(Fragment &frag) {
378  load_with_pointer_offset(frag, 0);
379  }
380 
382  CUTLASS_DEVICE
383  void store_with_pointer_offset(
384  Fragment const &frag,
385  Index pointer_offset) {
386 
387  iterator_.store_with_pointer_offset(frag, pointer_offset);
388  }
389 
391  CUTLASS_DEVICE
392  void store(Fragment const &frag) {
393  store_with_pointer_offset(frag, 0);
394  }
395 };
396 
397 
399 
407 template <
408  typename Shape_,
409  typename Element_,
410  int AdvanceRank,
411  typename ThreadMap_,
412  int Alignment
413 >
414 class RegularTileIterator<
415  Shape_,
416  Element_,
417  layout::RowMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
418  AdvanceRank,
419  ThreadMap_,
420  Alignment> {
421 public:
422 
423  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
424  "Specialization for row-major iterator may advance along the "
425  "columns(rank=0) or rows(rank=1) dimension.");
426 
427  using Shape = Shape_;
428  using Element = Element_;
429  using Layout = layout::RowMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
430  static int const kAdvanceRank = AdvanceRank;
431 
432  using Index = typename Layout::Index;
433  using LongIndex = typename Layout::LongIndex;
434 
435  using TensorRef = TensorRef<Element, Layout>;
436  using TensorCoord = typename Layout::TensorCoord;
437 
438  using ThreadMap = ThreadMap_;
439 
440  /// Underlying iterator type
441  using UnderlyingIterator = RegularTileIterator<
442  layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
443  Element,
444  layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
445  (kAdvanceRank == 0 ? 1 : 0),
446  ThreadMap_>;
447 
448 public:
449 
451  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
452 
453 private:
454 
456  UnderlyingIterator iterator_;
457 
458 public:
459 
460  /// Construct a TileIterator with zero threadblock offset
461  CUTLASS_HOST_DEVICE
462  RegularTileIterator(
463  TensorRef ref,
464  int thread_id
465  ): iterator_({ref.data(), ref.stride()}, thread_id) {
466 
467  }
468 
471  void add_pointer_offset(LongIndex pointer_offset) {
472  iterator_.add_pointer_offset(pointer_offset);
473  }
474 
476  CUTLASS_DEVICE
477  void add_tile_offset(TensorCoord const &coord) {
478  iterator_.add_tile_offset({coord.column(), coord.row()});
479  }
480 
484 
485  ++iterator_;
486  return *this;
487  }
488 
492 
493  RegularTileIterator prev(*this);
494  ++iterator_;
495 
496  return prev;
497  }
498 
500  CUTLASS_DEVICE
501  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
502  iterator_.load_with_pointer_offset(frag, pointer_offset);
503  }
504 
506  CUTLASS_DEVICE
507  void load(Fragment &frag) {
508  load_with_pointer_offset(frag, 0);
509  }
510 
512  CUTLASS_DEVICE
513  void store_with_pointer_offset(
514  Fragment const &frag,
515  Index pointer_offset) {
516 
517  iterator_.store_with_pointer_offset(frag, pointer_offset);
518  }
519 
521  CUTLASS_DEVICE
522  void store(Fragment const &frag) {
523  store_with_pointer_offset(frag, 0);
524  }
525 };
533 template <
534  typename Shape_,
535  typename Element_,
536  int AdvanceRank,
537  typename ThreadMap_,
538  int Alignment
539 >
540 class RegularTileIterator<
541  Shape_,
542  Element_,
543  layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
544  AdvanceRank,
545  ThreadMap_,
546  Alignment> {
547 public:
548 
549  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
550  "Specialization for pitch-linear iterator may advance along the "
551  "contiguous(rank=0) or strided(rank=1) dimension.");
552 
553  using Shape = Shape_;
554  using Element = Element_;
555  using Layout = layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
556  static int const kAdvanceRank = AdvanceRank;
557 
558  using Index = typename Layout::Index;
559  using LongIndex = typename Layout::LongIndex;
560 
561  using TensorRef = TensorRef<Element, Layout>;
562  using TensorCoord = typename Layout::TensorCoord;
563 
564  using ThreadMap = ThreadMap_;
565 
567  struct Detail {
568 
570  static int const kAccessSizeInBits = 128;
571 
572  static_assert(
573  sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
574  "This iterator requires a policy whose access size is 128 bits.");
575 
577  static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
578  };
579 
580 
581 private:
582 
584  using AccessType = Array<Element, Layout::kElementsPerAccess>;
585 
586 public:
587 
589  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
590 
591 private:
592 
593  //
594  // Data members
595  //
596 
598  Index stride_;
599 
601  AccessType * pointer_[Detail::kPointerCount];
602 
604  Index byte_offset_;
605 
606 public:
607 
608  /// Construct a TileIterator with zero threadblock offset
609  CUTLASS_HOST_DEVICE
610  RegularTileIterator(
611  TensorRef ref,
612  int thread_id
613  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
614 
615  layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
616 
618  for (int i = 0; i < Detail::kPointerCount; ++i) {
619 
620  // This is the offset of a thread within a threadblock tile for a specific pointer
621  // (units of elements)
622  layout::PitchLinearCoord thread_offset_in_threadblock_tile =
623  thread_offset_base + layout::PitchLinearCoord{0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
624 
625  // initialize pointer
626  pointer_[i] = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
627  }
628  }
629 
632  void add_pointer_offset(LongIndex pointer_offset) {
633 
634  byte_offset_ += pointer_offset * sizeof(Element);
635  }
636 
640 
641  add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
642 
643  return *this;
644  }
645 
649 
650  RegularTileIterator prev(*this);
651  this->operator++();
652 
653  return prev;
654  }
655 
657  CUTLASS_DEVICE
658  void add_tile_offset(TensorCoord const &coord) {
659  add_pointer_offset(
660  coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
661  coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess
662  );
663  }
664 
666  CUTLASS_DEVICE
667  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
668 
669  AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
670 
671  Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
672 
674  for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
675 
676  AccessType *access_ptr = pointer_[s & 1];
677  int stride_idx = (s & ~1);
678 
680  for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
681 
682  int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
683  c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
684  vec_pointer_offset;
685 
686  int access_idx = c + s * ThreadMap::Iterations::kContiguous;
687 
688  char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);
689 
690  frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
691  }
692  }
693  }
694 
696  CUTLASS_DEVICE
697  void load(Fragment &frag) {
698  load_with_pointer_offset(frag, 0);
699  }
700 
702  CUTLASS_DEVICE
703  void store_with_pointer_offset(
704  Fragment const &frag,
705  Index pointer_offset) {
706 
707  AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
708 
709  Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
710 
712  for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
713 
714  AccessType *access_ptr = pointer_[s & 1];
715  int stride_idx = (s & ~1);
716 
718  for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
719 
720  int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
721  c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
722  vec_pointer_offset;
723 
724  int access_idx = c + s * ThreadMap::Iterations::kContiguous;
725 
726  char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);
727 
728  *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) = frag_ptr[access_idx];
729  }
730  }
731  }
732 
734  CUTLASS_DEVICE
735  void store(Fragment const &frag) {
736  store_with_pointer_offset(frag, 0);
737  }
738 };
739 
741 
749 template <
750  typename Shape_,
751  typename Element_,
752  int AdvanceRank,
753  typename ThreadMap_,
754  int Alignment
755 >
756 class RegularTileIterator<
757  Shape_,
758  Element_,
759  layout::ColumnMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
760  AdvanceRank,
761  ThreadMap_,
762  Alignment> {
763 public:
764 
765  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
766  "Specialization for column-major iterator may advance along the "
767  "columns(rank=0) or rows(rank=1) dimension.");
768 
769  using Shape = Shape_;
770  using Element = Element_;
771  using Layout = layout::ColumnMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
772  static int const kAdvanceRank = AdvanceRank;
773 
774  using Index = typename Layout::Index;
775  using LongIndex = typename Layout::LongIndex;
776 
777  using TensorRef = TensorRef<Element, Layout>;
778  using TensorCoord = typename Layout::TensorCoord;
779 
780  using ThreadMap = ThreadMap_;
781 
782  /// Underlying iterator type
783  using UnderlyingIterator = RegularTileIterator<
784  layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
785  Element,
786  layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
787  (kAdvanceRank == 0 ? 0 : 1),
788  ThreadMap_>;
789 
790 public:
791 
793  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
794 
795 private:
796 
798  UnderlyingIterator iterator_;
799 
800 public:
801 
802  /// Construct a TileIterator with zero threadblock offset
803  CUTLASS_HOST_DEVICE
804  RegularTileIterator(
805  TensorRef ref,
806  int thread_id
807  ): iterator_({ref.data(), ref.stride()}, thread_id) {
808 
809  }
810 
813  void add_pointer_offset(LongIndex pointer_offset) {
814  iterator_.add_pointer_offset(pointer_offset);
815  }
816 
818  CUTLASS_DEVICE
819  void add_tile_offset(TensorCoord const &coord) {
820  iterator_.add_tile_offset({coord.row(), coord.column()});
821  }
822 
826 
827  ++iterator_;
828  return *this;
829  }
830 
834 
835  RegularTileIterator prev(*this);
836  ++iterator_;
837 
838  return prev;
839  }
840 
842  CUTLASS_DEVICE
843  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
844  iterator_.load_with_pointer_offset(frag, pointer_offset);
845  }
846 
848  CUTLASS_DEVICE
849  void load(Fragment &frag) {
850  load_with_pointer_offset(frag, 0);
851  }
852 
854  CUTLASS_DEVICE
855  void store_with_pointer_offset(
856  Fragment const &frag,
857  Index pointer_offset) {
858 
859  iterator_.store_with_pointer_offset(frag, pointer_offset);
860  }
861 
863  CUTLASS_DEVICE
864  void store(Fragment const &frag) {
865  store_with_pointer_offset(frag, 0);
866  }
867 };
868 
869 
871 
879 template <
880  typename Shape_,
881  typename Element_,
882  int AdvanceRank,
883  typename ThreadMap_,
884  int Alignment
885 >
886 class RegularTileIterator<
887  Shape_,
888  Element_,
889  layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
890  AdvanceRank,
891  ThreadMap_,
892  Alignment> {
893 public:
894 
895  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
896  "Specialization for row-major iterator may advance along the "
897  "columns(rank=0) or rows(rank=1) dimension.");
898 
899  using Shape = Shape_;
900  using Element = Element_;
901  using Layout = layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
902  static int const kAdvanceRank = AdvanceRank;
903 
904  using Index = typename Layout::Index;
905  using LongIndex = typename Layout::LongIndex;
906 
907  using TensorRef = TensorRef<Element, Layout>;
908  using TensorCoord = typename Layout::TensorCoord;
909 
910  using ThreadMap = ThreadMap_;
911 
912  /// Underlying iterator type
913  using UnderlyingIterator = RegularTileIterator<
914  layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
915  Element,
916  layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
917  (kAdvanceRank == 0 ? 1 : 0),
918  ThreadMap_>;
919 
920 public:
921 
923  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
924 
925 private:
926 
928  UnderlyingIterator iterator_;
929 
930 public:
931 
932  /// Construct a TileIterator with zero threadblock offset
933  CUTLASS_HOST_DEVICE
934  RegularTileIterator(
935  TensorRef ref,
936  int thread_id
937  ): iterator_({ref.data(), ref.stride()}, thread_id) {
938 
939  }
940 
943  void add_pointer_offset(LongIndex pointer_offset) {
944  iterator_.add_pointer_offset(pointer_offset);
945  }
946 
948  CUTLASS_DEVICE
949  void add_tile_offset(TensorCoord const &coord) {
950  iterator_.add_tile_offset({coord.column(), coord.row()});
951  }
952 
956 
957  ++iterator_;
958  return *this;
959  }
960 
964 
965  RegularTileIterator prev(*this);
966  ++iterator_;
967 
968  return prev;
969  }
970 
972  CUTLASS_DEVICE
973  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
974  iterator_.load_with_pointer_offset(frag, pointer_offset);
975  }
976 
978  CUTLASS_DEVICE
979  void load(Fragment &frag) {
980  load_with_pointer_offset(frag, 0);
981  }
982 
984  CUTLASS_DEVICE
985  void store_with_pointer_offset(
986  Fragment const &frag,
987  Index pointer_offset) {
988 
989  iterator_.store_with_pointer_offset(frag, pointer_offset);
990  }
991 
993  CUTLASS_DEVICE
994  void store(Fragment const &frag) {
995  store_with_pointer_offset(frag, 0);
996  }
997 };
998 
999 
1011 template <
1012  typename Shape_,
1013  typename Element_,
1014  int AdvanceRank,
1015  typename ThreadMap_,
1016  int Alignment
1017 >
1018 class RegularTileIterator<
1019  Shape_, Element_,
1020  layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
1021  Shape_::kContiguous>,
1022  AdvanceRank, ThreadMap_, Alignment> {
1023 
1024  public:
1025  static_assert(
1026  AdvanceRank == 0 || AdvanceRank == 1,
1027  "Specialization for pitch-linear iterator may advance along the "
1028  "contiguous(rank=0) or strided(rank=1) dimension.");
1029 
1030  using Shape = Shape_;
1031  using Element = Element_;
1032  using Layout =
1033  layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
1034  Shape::kContiguous>;
1035  static int const kAdvanceRank = AdvanceRank;
1036 
1037  using Index = typename Layout::Index;
1038  using LongIndex = typename Layout::LongIndex;
1039 
1040  using TensorRef = TensorRef<Element, Layout>;
1041  using TensorCoord = typename Layout::TensorCoord;
1042 
1043  using ThreadMap = ThreadMap_;
1044 
1046  struct Detail {
1047 
1049  static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
1050 
1052  static int const kIterarionsPerAccess =
1053  ThreadMap::kElementsPerAccess / Layout::kElementsPerAccess;
1054 
1056  static int const kContiguousElementsPerLine = 4;
1057  };
1058 
1059  private:
1061  using AccessType = Array<Element, Layout::kElementsPerAccess>;
1062 
1063  public:
1065  using Fragment =
1066  Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
1067 
1068  private:
1069  //
1070  // Data members
1071  //
1072 
1076  Index line_size;
1077 
1079  AccessType *pointer_[Detail::kPointerCount];
1080 
1082  Index byte_offset_;
1083 
1084 
1085  public:
1086  /// Construct a TileIterator with zero threadblock offset
1087  CUTLASS_HOST_DEVICE
1088  RegularTileIterator(TensorRef ref,
1089  int thread_id
1090  )
1091  : line_size(ref.stride(0) * Detail::kContiguousElementsPerLine / Layout::kElementsPerAccess),
1092  byte_offset_(0) {
1093 
1094  layout::PitchLinearCoord thread_offset_base =
1095  ThreadMap::initial_offset(thread_id);
1096 
1098  for (int i = 0; i < Detail::kPointerCount; ++i) {
1099  // This is the offset of a thread within a threadblock tile for a specific
1100  // pointer (units of elements)
1101  layout::PitchLinearCoord thread_offset_in_threadblock_tile =
1102  thread_offset_base +
1104  0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
1105 
1106  // initialize pointer
1107  pointer_[i] = reinterpret_cast<AccessType *>(
1108  ref.data() + ref.offset(thread_offset_in_threadblock_tile));
1109  }
1110  }
1111 
1114  void add_pointer_offset(LongIndex pointer_offset) {
1115  byte_offset_ += pointer_offset * sizeof(Element);
1116  }
1117 
1118  /// Advances to the next tile in memory
1119  CUTLASS_HOST_DEVICE
1120  RegularTileIterator &operator++() {
1121  // (Shape::kContiguous/Layout::kElementsPerAccess)*
1122  // line_size * Layout::kElementsPerAccess
1123  add_pointer_offset(Shape::kContiguous * line_size);
1124  return *this;
1125  }
1126 
1127  /// Advances to the next tile in memory
1128  CUTLASS_HOST_DEVICE
1129  RegularTileIterator operator++(int) {
1130  RegularTileIterator prev(*this);
1131  this->operator++();
1132 
1133  return prev;
1134  }
1135 
1137  CUTLASS_DEVICE
1138  void add_tile_offset(TensorCoord const &coord) {
1139  add_pointer_offset((coord.contiguous() * (Shape::kContiguous / Layout::kElementsPerAccess) *
1140  line_size + coord.strided() * Shape::kStrided) *
1141  Layout::kElementsPerAccess);
1142  }
1143 
1145  CUTLASS_DEVICE
1146  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
1147  AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
1148 
1149  Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
1150 
1152  for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
1153  AccessType *access_ptr = pointer_[(s & 1) ^ (s / 2)];
1154 
1155  access_ptr += 16 * (s / 2);
1156 
1158  for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
1159 
1161  for(int i = 0; i < Detail::kIterarionsPerAccess; ++i) {
1162 
1163  int access_offset =
1164  c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
1165  vec_pointer_offset + i * line_size;
1166 
1167  int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
1168  Detail::kIterarionsPerAccess + i;
1169 
1170  char const *access_byte_ptr = reinterpret_cast<char const*>(access_ptr + access_offset);
1171 
1172  frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
1173  access_byte_ptr + byte_offset_);
1174  }
1175  }
1176  }
1177  }
1178 
1180  CUTLASS_DEVICE
1181  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
1182 
1184  CUTLASS_DEVICE
1185  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
1186  AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
1187 
1188  Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
1189 
1191  for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
1192  AccessType *access_ptr = pointer_[(s & 1) ^ ((s >> 1) & 1)];
1193 
1194  access_ptr += 16 * (s / 2);
1195 
1197  for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
1199  for(int i = 0; i < Detail::kIterarionsPerAccess; ++i) {
1200 
1201  int access_offset =
1202  c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
1203  vec_pointer_offset + i * line_size;
1204 
1205  int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
1206  Detail::kIterarionsPerAccess + i;
1207 
1208  char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);
1209 
1210  *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) =
1211  frag_ptr[access_idx];
1212  }
1213  }
1214  }
1215  }
1216 
1218  CUTLASS_DEVICE
1219  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
1220 };
1221 
1223 
1231 template <
1232  typename Shape_,
1233  typename Element_,
1234  int AdvanceRank,
1235  typename ThreadMap_,
1236  int Alignment
1237 >
1238 class RegularTileIterator<Shape_, Element_,
1239  layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
1240  sizeof_bits<Element_>::value, Shape_::kRow>,
1241  AdvanceRank, ThreadMap_, Alignment> {
1242  public:
1243  static_assert(
1244  AdvanceRank == 0 || AdvanceRank == 1,
1245  "Specialization for column-major iterator may advance along the "
1246  "columns(rank=0) or rows(rank=1) dimension.");
1247 
1248  using Shape = Shape_;
1249  using Element = Element_;
1250  using Layout = layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
1251  sizeof_bits<Element_>::value, Shape::kRow>;
1252  static int const kAdvanceRank = AdvanceRank;
1253 
1254  using Index = typename Layout::Index;
1255  using LongIndex = typename Layout::LongIndex;
1256 
1257  using TensorRef = TensorRef<Element, Layout>;
1258  using TensorCoord = typename Layout::TensorCoord;
1259 
1260  using ThreadMap = ThreadMap_;
1261 
1262  /// Underlying iterator type
1263  using UnderlyingIterator = RegularTileIterator<
1264  layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
1265  layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
1266  Shape::kRow>,
1267  (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
1268 
1269  public:
1271  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
1272 
1273  private:
1275  UnderlyingIterator iterator_;
1276 
1277  public:
1278  /// Construct a TileIterator with zero threadblock offset
1279  CUTLASS_HOST_DEVICE
1280  RegularTileIterator(TensorRef ref,
1281  int thread_id
1282  )
1283  : iterator_({ref.data(), ref.stride()}, thread_id) {}
1284 
1287  void add_pointer_offset(LongIndex pointer_offset) {
1288  iterator_.add_pointer_offset(pointer_offset);
1289  }
1290 
1292  CUTLASS_DEVICE
1293  void add_tile_offset(TensorCoord const &coord) {
1294  iterator_.add_tile_offset({coord.row(), coord.column()});
1295  }
1296 
1300  ++iterator_;
1301  return *this;
1302  }
1303 
1307  RegularTileIterator prev(*this);
1308  ++iterator_;
1309 
1310  return prev;
1311  }
1312 
1314  CUTLASS_DEVICE
1315  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
1316  iterator_.load_with_pointer_offset(frag, pointer_offset);
1317  }
1318 
1320  CUTLASS_DEVICE
1321  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
1322 
1324  CUTLASS_DEVICE
1325  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
1326  iterator_.store_with_pointer_offset(frag, pointer_offset);
1327  }
1328 
1330  CUTLASS_DEVICE
1331  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
1332 };
1333 
1335 
1343 template <
1344  typename Shape_,
1345  typename Element_,
1346  int AdvanceRank,
1347  typename ThreadMap_,
1348  int Alignment
1349 >
1350 class RegularTileIterator<Shape_, Element_,
1351  layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
1352  sizeof_bits<Element_>::value, Shape_::kColumn>,
1353  AdvanceRank, ThreadMap_, Alignment> {
1354  public:
1355  static_assert(
1356  AdvanceRank == 0 || AdvanceRank == 1,
1357  "Specialization for row-major iterator may advance along the "
1358  "columns(rank=0) or rows(rank=1) dimension.");
1359 
1360  using Shape = Shape_;
1361  using Element = Element_;
1362  using Layout = layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
1363  sizeof_bits<Element_>::value, Shape::kColumn>;
1364  static int const kAdvanceRank = AdvanceRank;
1365  static int const kAlignment = Alignment;
1366 
1367  using Index = typename Layout::Index;
1368  using LongIndex = typename Layout::LongIndex;
1369 
1370  using TensorRef = TensorRef<Element, Layout>;
1371  using TensorCoord = typename Layout::TensorCoord;
1372 
1373  using ThreadMap = ThreadMap_;
1374 
1375  /// Underlying iterator type
1376  using UnderlyingIterator = RegularTileIterator<
1377  layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
1378  layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
1379  Shape::kColumn>,
1380  (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
1381 
1382  public:
1384  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
1385 
1386  private:
1388  UnderlyingIterator iterator_;
1389 
1390  public:
1391  /// Construct a TileIterator with zero threadblock offset
1392  CUTLASS_HOST_DEVICE
1393  RegularTileIterator(TensorRef ref,
1394  int thread_id
1395  )
1396  : iterator_({ref.data(), ref.stride()}, thread_id) {}
1397 
1400  void add_pointer_offset(LongIndex pointer_offset) {
1401  iterator_.add_pointer_offset(pointer_offset);
1402  }
1403 
1405  CUTLASS_DEVICE
1406  void add_tile_offset(TensorCoord const &coord) {
1407  iterator_.add_tile_offset({coord.column(), coord.row()});
1408  }
1409 
1413  ++iterator_;
1414  return *this;
1415  }
1416 
1420  RegularTileIterator prev(*this);
1421  ++iterator_;
1422 
1423  return prev;
1424  }
1425 
1427  CUTLASS_DEVICE
1428  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
1429  iterator_.load_with_pointer_offset(frag, pointer_offset);
1430  }
1431 
1433  CUTLASS_DEVICE
1434  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
1435 
1437  CUTLASS_DEVICE
1438  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
1439  iterator_.store_with_pointer_offset(frag, pointer_offset);
1440  }
1441 
1443  CUTLASS_DEVICE
1444  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
1445 };
1446 
1447 
1449 
1450 } // namespace threadblock
1451 } // namespace transform
1452 } // namespace cutlass
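In practice these specializations are selected indirectly by CUTLASS's default Volta (SM70) GEMM configurations, but a minimal sketch of direct instantiation follows. The tile shape, thread count, warp arrangement, and element type below are illustrative assumptions, not values prescribed by this header; PitchLinearWarpRakedThreadMap is the thread map defined in cutlass/transform/pitch_linear_thread_map.h.

// Hedged usage sketch (assumed parameters): staging a 64x64 half-precision tile
// through the pitch-linear VoltaTensorOpMultiplicandCongruous specialization
// with a 256-thread threadblock.
#include "cutlass/half.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
#include "cutlass/transform/pitch_linear_thread_map.h"
#include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h"

using TileShape = cutlass::layout::PitchLinearShape<64, 64>;  // contiguous x strided extent (assumed)
using Element   = cutlass::half_t;                            // 8 x 16-bit elements = one 128-bit access

// Warp-raked thread map: 256 threads, an 8x4 warp thread arrangement, 8 elements per access,
// which satisfies the 128-bit access-size static_assert above.
using ThreadMap = cutlass::transform::PitchLinearWarpRakedThreadMap<
    TileShape, 256, cutlass::layout::PitchLinearShape<8, 4>, 8>;

// Selects the first specialization in this file; AdvanceRank = 1 advances along the strided rank.
using TileIterator = cutlass::transform::threadblock::RegularTileIterator<
    TileShape, Element,
    cutlass::layout::VoltaTensorOpMultiplicandCongruous<cutlass::sizeof_bits<Element>::value>,
    1, ThreadMap, 16>;

// Each thread writes its fragment into the swizzled tile, then the iterator is
// advanced to the next tile along the strided dimension.
__device__ void stage_tile(TileIterator::TensorRef ref,
                           TileIterator::Fragment const &frag,
                           int thread_id) {
  TileIterator iterator(ref, thread_id);
  iterator.store(frag);
  ++iterator;
}

In a real kernel, ref would typically reference the threadblock's shared-memory tile and the fragment would be produced by a matching global-memory tile iterator; the parameters shown here are only one consistent combination.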