CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
regular_tile_access_iterator_tensor_op.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Templates implementing the address computation for storing tiles
   from pitch-linear rank=2 tensors.
*/

#pragma once

#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"

////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace transform {
namespace threadblock {

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for congruous arrangements for TensorOps
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileAccessIterator<
    Shape_, Element_,
    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                          int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout =
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size that is 128 bits in
    /// length.
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value *
                          ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits");

    /// Number of pointers
    static int const kPointerCount =
        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };
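
  // Note: when ThreadMap::Iterations::kStrided > 1, the iterator keeps two
  // precomputed base pointers whose swizzled addresses alternate across the
  // strided dimension; get() then selects one with (iteration_strided_ & 1)
  // instead of recomputing the permuted offset on every access.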

  /// Element type per access
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 private:
  //
  // Data members
  //

  /// Stride value
  Index stride_;

  /// Internal pointer to first access of tile
  AccessType *pointer_[Detail::kPointerCount];

  /// Internal byte offset
  Index byte_offset_;

  /// Iteration in the contiguous dimension
  int iteration_contiguous_;

  /// Iteration in the strided dimension
  int iteration_strided_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
                            int thread_id   ///< ID of each participating thread
                            )
      : stride_(ref.stride(0) / Layout::kElementsPerAccess),
        byte_offset_(0) {
    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {
      // This is the offset of a thread within a threadblock tile for a
      // specific pointer (units of elements)
      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      // initialize pointer
      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    }

    set_iteration_index(0);
  }

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) {
    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
  }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }

  /// Returns a pointer
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
    int stride_idx = (iteration_strided_ & ~1);

    int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
                            ThreadMap::kElementsPerAccess;

    char *access_byte_ptr =
        reinterpret_cast<char *>(access_ptr + access_offset);
    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
  }
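
  // Note: access_offset is measured in units of AccessType. Consecutive
  // strided iterations alternate between the two base pointers, while
  // stride_idx (iteration_strided_ & ~1) advances in steps of two, so each
  // pointer is offset only once per pair of strided steps.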

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator &operator++() {
    ++iteration_contiguous_;

    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
      return *this;

    // Enter here only if (iteration_contiguous_ ==
    // ThreadMap::Iterations::kContiguous)
    iteration_contiguous_ = 0;
    ++iteration_strided_;

    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
      return *this;
    }

    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided),
    // which means we enter the next tile.
    iteration_strided_ = 0;

    return *this;
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator operator++(int) {
    RegularTileAccessIterator prev(*this);
    this->operator++();

    return prev;
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
                       coord.strided() * Shape::kStrided * stride_ *
                           Layout::kElementsPerAccess);
  }
};
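
// Note: although documented as advancing tiles, operator++ steps through the
// accesses within one tile: all contiguous accesses first, then the strided
// dimension. After the final strided iteration the indices wrap to zero so
// that add_tile_offset() can move the iterator to the next tile.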

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for column-major congruous arrangements for
/// TensorOps
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileAccessIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileAccessIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:
  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
                            int thread_id   ///< ID of each participating thread
                            )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator operator++(int) {
    RegularTileAccessIterator prev(*this);
    ++iterator_;

    return prev;
  }
};
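
// Note: the column-major and row-major specializations are thin adapters over
// the pitch-linear iterator above: each maps its matrix coordinates onto
// pitch-linear coordinates (the congruous dimension becomes the contiguous
// one) and delegates every operation to the underlying iterator.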

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for row-major congruous arrangements for
/// TensorOps
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileAccessIterator<
    Shape_, Element_,
    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                                  int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileAccessIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:
  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
                            int thread_id   ///< ID of each participating thread
                            )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator operator++(int) {
    RegularTileAccessIterator prev(*this);
    ++iterator_;

    return prev;
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for crosswise arrangements for TensorOps
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileAccessIterator<Shape_, Element_,
                                layout::TensorOpMultiplicandCrosswise<
                                    sizeof_bits<Element_>::value, Crosswise>,
                                AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout =
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;
  static int const kCrosswise = Crosswise;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size that is 128 bits in
    /// length.
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value *
                          ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits");

    /// Number of pointers
    static int const kPointerCount =
        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };

  /// Element type per access
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 private:
  //
  // Data members
  //

  /// Total number of sections. The memory is divided into stages; one stage
  /// can store one tile, and a stage is divided into sections, so a tile may
  /// span multiple sections.
  int sections_;

  /// Number of sections that one stage (one tile) spans
  int sections_per_stage_;

  /// Stride value
  Index stride_;

  /// Internal pointer to first access of tile
  AccessType *pointer_[Detail::kPointerCount];

  /// Internal byte offset
  Index byte_offset_;

  /// Iteration in the contiguous dimension
  int iteration_contiguous_;

  /// Iteration in the strided dimension
  int iteration_strided_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
                            int thread_id   ///< ID of each participating thread
                            )
      : sections_(ref.stride(0) / kCrosswise),
        sections_per_stage_(Shape::kContiguous / kCrosswise),
        // stride_ = kCrosswise x sections_ x kFactor
        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
        byte_offset_(0) {
    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {
      // This is the offset of a thread within a threadblock tile for a
      // specific pointer (units of elements)
      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      // initialize pointer
      pointer_[i] = reinterpret_cast<AccessType *>(ref.data()) +
                    ref.offset(thread_offset_in_threadblock_tile) /
                        Layout::kElementsPerAccess;
    }

    set_iteration_index(0);
  }
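
  // Note: stride_ is measured in units of AccessType and already folds in
  // Layout::kFactor (see the initializer comment above), which is why get()
  // divides the strided term by Layout::kFactor; sections_per_stage_ records
  // how many crosswise sections one tile spans in the contiguous dimension.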

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) {
    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
  }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof_bits<Element>::value / 8;
  }

  /// Returns a pointer
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
    int stride_idx = (iteration_strided_ & ~1);

    int access_offset =
        stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
            ThreadMap::kElementsPerAccess;
    char *access_byte_ptr =
        reinterpret_cast<char *>(access_ptr + access_offset);
    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator &operator++() {
    ++iteration_contiguous_;

    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
      return *this;

    // Enter here only if (iteration_contiguous_ ==
    // ThreadMap::Iterations::kContiguous)
    iteration_contiguous_ = 0;
    ++iteration_strided_;

    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
      return *this;
    }

    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided),
    // which means we enter the next section.
    iteration_strided_ = 0;

    return *this;
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator operator++(int) {
    RegularTileAccessIterator prev(*this);
    this->operator++();

    return prev;
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(coord.contiguous() * sections_per_stage_ * stride_ *
                           ThreadMap::kElementsPerAccess / sections_ +
                       coord.strided() * Shape::kStrided * stride_ *
                           Layout::kElementsPerAccess);
  }
};
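
// Note: both terms passed to add_pointer_offset above are in units of Element;
// the contiguous term advances by whole stages (sections_per_stage_ sections
// out of sections_), while the strided term advances by whole tiles of
// Shape::kStrided.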

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for column-major crosswise arrangements for
/// TensorOps
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileAccessIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileAccessIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:
  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
                            int thread_id   ///< ID of each participating thread
                            )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator operator++(int) {
    RegularTileAccessIterator prev(*this);
    ++iterator_;

    return prev;
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for row-major crosswise arrangements for
/// TensorOps
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileAccessIterator<Shape_, Element_,
                                layout::RowMajorTensorOpMultiplicandCrosswise<
                                    sizeof_bits<Element_>::value, Crosswise>,
                                AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileAccessIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:
  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
                            int thread_id   ///< ID of each participating thread
                            )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory.
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator operator++(int) {
    RegularTileAccessIterator prev(*this);
    ++iterator_;

    return prev;
  }
};
////////////////////////////////////////////////////////////////////////////////

}  // namespace threadblock
}  // namespace transform
}  // namespace cutlass
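
////////////////////////////////////////////////////////////////////////////////
//
// Usage sketch (illustrative only). The ThreadMap type WarpThreadMap and the
// 128x32 tile shape below are assumptions, not part of this header; in
// practice the GEMM mainloop supplies them and instantiates the iterator
// roughly like this:
//
//   using Element  = cutlass::half_t;
//   using Shape    = cutlass::layout::PitchLinearShape<128, 32>;
//   using Layout   = cutlass::layout::TensorOpMultiplicandCongruous<
//       cutlass::sizeof_bits<Element>::value, int(128 / sizeof(Element))>;
//   using Iterator = cutlass::transform::threadblock::RegularTileAccessIterator<
//       Shape, Element, Layout, /*AdvanceRank=*/1, WarpThreadMap,
//       /*Alignment=*/16>;
//
//   // ref is a cutlass::TensorRef<Element, Layout> over shared memory
//   Iterator iter(ref, threadIdx.x);
//   Iterator::AccessType *ptr = iter.get();  // one 128-bit access
//   ++iter;                                  // next access within the tile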