CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
regular_tile_iterator_tensor_op.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Templates implementing storing of tiles from pitch-linear rank=2 tensors.
*/

#pragma once

#include "cutlass/cutlass.h"

#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"

////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace transform {
namespace threadblock {

////////////////////////////////////////////////////////////////////////////////
/// Tile iterator specialized for congruous arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
    Shape_, Element_,
    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                          int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
                "Specialization for pitch-linear iterator may advance along the "
                "contiguous (rank=0) or strided (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout =
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Internal details made public to facilitate introspection
  struct Detail {

    /// This iterator is specialized for an access size that is 128 bits in length.
    static int const kAccessSizeInBits = 128;

    static_assert(
        sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
        "This iterator requires a policy whose access size is 128 bits.");
  };

 private:

  /// Element type per access
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 public:

  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

  /// Underlying iterator used to compute the addresses
  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
                                                       kAdvanceRank, ThreadMap>;

 private:

  //
  // Data members
  //

  /// Iterator that computes the address of each access
  TileAccessIterator address_iterator_;

 public:

  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : address_iterator_(ref, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    address_iterator_.add_pointer_offset(pointer_offset);
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    address_iterator_.add_tile_offset({0, 1});
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    this->operator++();

    return prev;
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    address_iterator_.add_tile_offset(coord);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    address_iterator_.set_iteration_index(0);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
        frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
        ++address_iterator_;
      }
    }
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    address_iterator_.set_iteration_index(0);
    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
        *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
        ++address_iterator_;
      }
    }
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }
};
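
// Usage sketch (illustrative only, not part of the original header). Assuming
// half_t elements and a warp-raked pitch-linear thread map, a threadblock
// might stage a 128x8 tile through shared memory roughly as follows. The
// shapes and the 32-thread / 8-elements-per-access map are assumptions chosen
// to satisfy the 128-bit access static_assert above, not fixed API:
//
//   using Shape     = cutlass::layout::PitchLinearShape<128, 8>;
//   using Element   = cutlass::half_t;  // 16 bits -> 8 elements per 128-bit access
//   using Layout    = cutlass::layout::TensorOpMultiplicandCongruous<16, 64>;
//   using ThreadMap = cutlass::transform::PitchLinearWarpRakedThreadMap<
//       Shape, 32, cutlass::layout::PitchLinearShape<8, 4>, 8>;
//
//   using Iterator = cutlass::transform::threadblock::RegularTileIterator<
//       Shape, Element, Layout, /*AdvanceRank=*/1, ThreadMap>;
//
//   __shared__ Element smem[Shape::kContiguous * Shape::kStrided];
//   Iterator iter({smem, Layout::packed({Shape::kContiguous, Shape::kStrided})},
//                 threadIdx.x);
//   typename Iterator::Fragment frag;
//   iter.load(frag);   // each thread gathers its 128-bit accesses
//   ++iter;            // advance to the next tile along the strided rank
//   iter.store(frag);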

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for congruous arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
                "Specialization for column-major iterator may advance along the "
                "columns (rank=0) or rows (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element))>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

 public:

  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:

  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    ++iterator_;

    return prev;
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for congruous arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
    Shape_, Element_,
    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                                  int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
                "Specialization for row-major iterator may advance along the "
                "columns (rank=0) or rows (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element))>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

 public:

  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:

  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    ++iterator_;

    return prev;
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }
};
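
// Adapter note with a small sketch (illustrative, not from the original
// source): the row-major and column-major specializations own no addressing
// logic of their own; they re-shape the tile into pitch-linear space and
// forward every call to UnderlyingIterator. For a row-major tile the
// contiguous rank is the column, so tile offsets swap coordinates:
//
//   // row-major (row, column) -> pitch-linear (contiguous, strided)
//   iter.add_tile_offset({/*row=*/1, /*column=*/2});
//   // ...is forwarded as iterator_.add_tile_offset({2, 1});
//
// The column-major adapter passes (row, column) through unchanged, since its
// rows are already the contiguous rank.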

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for crosswise arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileIterator<Shape_, Element_,
                          layout::TensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Crosswise>,
                          AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous (rank=0) or strided (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout =
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size that is 128 bits in length.
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits.");
  };

 private:
  /// Element type per access
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 public:
  /// Fragment object to be loaded or stored
  using Fragment =
      Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

  /// Underlying iterator used to compute the addresses
  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
                                                       kAdvanceRank, ThreadMap>;

 private:
  //
  // Data members
  //

  /// Iterator that computes the address of each access
  TileAccessIterator address_iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : address_iterator_(ref, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    address_iterator_.add_pointer_offset(pointer_offset);
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    address_iterator_.add_tile_offset({1, 0});
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    this->operator++();

    return prev;
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    address_iterator_.add_tile_offset(coord);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    address_iterator_.set_iteration_index(0);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
        frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
        ++address_iterator_;
      }
    }
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    address_iterator_.set_iteration_index(0);
    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
        *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
        ++address_iterator_;
      }
    }
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
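
// Note with a small sketch (illustrative, not from the original source): the
// congruous and crosswise specializations differ only in which rank
// operator++ advances. Congruous tiles step along the strided rank,
// crosswise tiles along the contiguous rank:
//
//   ++congruous_iter;   // address_iterator_.add_tile_offset({0, 1})
//   ++crosswise_iter;   // address_iterator_.add_tile_offset({1, 0})
//
// Everything else — the 128-bit access requirement, the Fragment shape, and
// the load/store loops over ThreadMap::Iterations — is identical between the
// two specializations.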

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for crosswise arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileIterator<Shape_, Element_,
                          layout::ColumnMajorTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Crosswise>,
                          AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns (rank=0) or rows (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

 public:
  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:
  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    ++iterator_;

    return prev;
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for crosswise arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileIterator<Shape_, Element_,
                          layout::RowMajorTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Crosswise>,
                          AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns (rank=0) or rows (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

 public:
  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:
  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    ++iterator_;

    return prev;
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
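
// Selection sketch (illustrative, not from the original source): the matrix
// layout tag alone picks one of the six specializations above. For example, a
// row-major 64x64 half_t tile in a crosswise arrangement (Crosswise = 32 is an
// assumed value, typically tied to the operand's K-block size) might be
// declared as follows, with ThreadMap supplied by the caller:
//
//   using SmemLayout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
//       cutlass::sizeof_bits<cutlass::half_t>::value, 32>;
//   using SmemIterator = cutlass::transform::threadblock::RegularTileIterator<
//       cutlass::MatrixShape<64, 64>, cutlass::half_t, SmemLayout,
//       /*AdvanceRank=*/0, ThreadMap>;
//
// The alias resolves to the row-major adapter above, which in turn drives the
// pitch-linear crosswise iterator.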

////////////////////////////////////////////////////////////////////////////////

}  // namespace threadblock
}  // namespace transform
}  // namespace cutlass

////////////////////////////////////////////////////////////////////////////////