CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
regular_tile_access_iterator_pitch_linear.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
30 #pragma once
31 
32 #include "cutlass/cutlass.h"
33 #include "cutlass/array.h"
35 #include "cutlass/layout/matrix.h"
36 #include "cutlass/matrix_coord.h"
37 #include "cutlass/matrix_shape.h"
38 #include "cutlass/tensor_ref.h"
39 
41 
43 
44 namespace cutlass {
45 namespace transform {
46 namespace threadblock {
47 
49 
57 template <typename Shape_, typename Element_, int AdvanceRank,
58  typename ThreadMap_, int Alignment>
60  Shape_, Element_,
61  layout::PitchLinear,
62  AdvanceRank, ThreadMap_, Alignment> {
63  public:
65  AdvanceRank == 0 || AdvanceRank == 1,
66  "Specialization for pitch-linear iterator may along advance along the "
67  "contiguous(rank=0) or strided(rank=1) dimension.");
68 
69  using Shape = Shape_;
70  using Element = Element_;
72  static int const kAdvanceRank = AdvanceRank;
73  static int const kAlignment = Alignment;
74 
75  using Index = typename Layout::Index;
76  using LongIndex = typename Layout::LongIndex;
77 
79  using TensorCoord = typename Layout::TensorCoord;
80 
81  using ThreadMap = ThreadMap_;
82 
84  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
85 
86  private:
87  //
88  // Data members
89  //
90 
92  Index stride_;
93 
95  AccessType *pointer_;
96 
98  Index byte_offset_;
99 
101  int iteration_contiguous_;
102 
104  int iteration_strided_;
105 
106  public:
110  int thread_id
111  )
112  : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
113  byte_offset_(0) {
114 
115  layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
116 
117  // initialize pointer
118  pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
119 
120  set_iteration_index(0);
121  }
122 
/// Overrides the internal iteration index by splitting a linear access index
/// into two components: the remainder modulo ThreadMap::Iterations::kContiguous
/// becomes the contiguous iteration and the quotient becomes the strided iteration.
/// The quotient is stored as-is; it is not wrapped against Iterations::kStrided here.
125  void set_iteration_index(int index) {
126  iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
127  iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
128  }
129 
/// Adds a pointer offset given in units of Element. The offset is scaled by
/// sizeof(Element) and accumulated into byte_offset_, which get() later adds
/// to the computed access address.
// NOTE(review): byte_offset_ is declared as Index while pointer_offset is
// LongIndex; if Index is the narrower type the accumulated value is truncated
// on assignment -- confirm the intended range.
132  void add_pointer_offset(LongIndex pointer_offset) {
133  byte_offset_ += pointer_offset * sizeof(Element);
134  }
135 
/// Returns a pointer to the AccessType addressed by the current
/// (iteration_contiguous_, iteration_strided_) pair, including any byte offset
/// accumulated through add_pointer_offset().
137  CUTLASS_DEVICE
138  AccessType *get() const {
139 
140  AccessType *access_ptr = pointer_;
141 
// Offset counted in AccessType units: the strided term is scaled by stride_
// (which the constructor divides by ThreadMap::kElementsPerAccess), and the
// contiguous term is divided by kElementsPerAccess to convert it to the same unit.
142  int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
143  iteration_contiguous_ * ThreadMap::Delta::kContiguous /
144  ThreadMap::kElementsPerAccess;
145 
// Switch to a byte pointer so byte_offset_ can be applied in bytes.
146  char *access_byte_ptr =
147  reinterpret_cast<char *>(access_ptr + access_offset);
148 
149  return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
150  }
151 
155  ++iteration_contiguous_;
156 
157  if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
158  return *this;
159 
160  // Enter here only if (iteration_contiguous_ ==
161  // ThreadMap::Iterations::kContiguous)
162  iteration_contiguous_ = 0;
163  ++iteration_strided_;
164 
165  if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
166  return *this;
167  }
168 
169  // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided)
170  // which means we enter the next tile.
171  iteration_strided_ = 0;
172 
173  return *this;
174  }
175 
179  RegularTileAccessIterator prev(*this);
180  this->operator++();
181 
182  return prev;
183  }
184 
/// Adds a tile offset: moves the iterator by whole tiles, coord.contiguous()
/// tiles of Shape::kContiguous elements plus coord.strided() tiles of
/// Shape::kStrided rows. The result is forwarded to add_pointer_offset(),
/// so the expression is in units of Element.
186  CUTLASS_DEVICE
187  void add_tile_offset(TensorCoord const &coord) {
// stride_ is held in AccessType units (see the constructor), so it is multiplied
// back by kElementsPerAccess to express the strided term in elements.
// NOTE(review): this recovers the original stride exactly only when that stride
// is a multiple of kElementsPerAccess -- confirm this is guaranteed by callers.
188  add_pointer_offset(coord.contiguous() * Shape::kContiguous +
189  coord.strided() * Shape::kStrided * stride_ *
190  ThreadMap::kElementsPerAccess);
191  }
192 };
193 
195 
203 template <typename Shape_, typename Element_, int AdvanceRank,
204  typename ThreadMap_, int Alignment>
206  Shape_, Element_,
207  layout::ColumnMajor,
208  AdvanceRank, ThreadMap_, Alignment> {
209  public:
211  AdvanceRank == 0 || AdvanceRank == 1,
212  "Specialization for pitch-linear iterator may along advance along the "
213  "contiguous(rank=0) or strided(rank=1) dimension.");
214 
215  using Shape = Shape_;
216  using Element = Element_;
218  static int const kAdvanceRank = AdvanceRank;
219  static int const kAlignment = Alignment;
220 
221  using Index = typename Layout::Index;
222  using LongIndex = typename Layout::LongIndex;
223 
226 
227  using ThreadMap = ThreadMap_;
228 
233  (kAdvanceRank == 0 ? 0 : 1),
234  ThreadMap_>;
235 
236  using AccessType = typename UnderlyingIterator::AccessType;
237 
238  private:
239 
241  UnderlyingIterator iterator_;
242 
243  public:
247  int thread_id
248  )
249  : iterator_({ref.data(), ref.stride()}, thread_id) {}
250 
/// Overrides the internal iteration index by forwarding it unchanged to the
/// wrapped underlying iterator.
253  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
254 
/// Adds a pointer offset in units of Element by forwarding it unchanged to the
/// wrapped underlying iterator.
257  void add_pointer_offset(LongIndex pointer_offset) {
258  iterator_.add_pointer_offset(pointer_offset);
259  }
260 
/// Returns the access pointer produced by the wrapped underlying iterator.
/// AccessType is an alias of UnderlyingIterator::AccessType in this class.
263  AccessType *get() const {
264  return reinterpret_cast<AccessType *>(iterator_.get());
265  }
266 
/// Adds a tile offset given as a matrix coordinate. For this column-major
/// adapter the row index is passed as the first component and the column index
/// as the second component of the coordinate handed to the underlying iterator.
268  CUTLASS_DEVICE
269  void add_tile_offset(TensorCoord const &coord) {
270  iterator_.add_tile_offset({coord.row(), coord.column()});
271  }
272 
276  ++iterator_;
277  return *this;
278  }
279 
283  RegularTileAccessIterator prev(*this);
284  ++iterator_;
285 
286  return prev;
287  }
288 };
289 
290 
292 
300 template <typename Shape_, typename Element_, int AdvanceRank,
301  typename ThreadMap_, int Alignment>
303  Shape_, Element_,
304  layout::RowMajor,
305  AdvanceRank, ThreadMap_, Alignment> {
306  public:
308  AdvanceRank == 0 || AdvanceRank == 1,
309  "Specialization for pitch-linear iterator may along advance along the "
310  "contiguous(rank=0) or strided(rank=1) dimension.");
311 
312  using Shape = Shape_;
313  using Element = Element_;
315  static int const kAdvanceRank = AdvanceRank;
316  static int const kAlignment = Alignment;
317 
318  using Index = typename Layout::Index;
319  using LongIndex = typename Layout::LongIndex;
320 
323 
324  using ThreadMap = ThreadMap_;
325 
330  (kAdvanceRank == 0 ? 1 : 0),
331  ThreadMap_>;
332 
333  using AccessType = typename UnderlyingIterator::AccessType;
334 
335  private:
336 
338  UnderlyingIterator iterator_;
339 
340  public:
344  int thread_id
345  )
346  : iterator_({ref.data(), ref.stride()}, thread_id) {}
347 
/// Overrides the internal iteration index by forwarding it unchanged to the
/// wrapped underlying iterator.
350  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
351 
/// Adds a pointer offset in units of Element by forwarding it unchanged to the
/// wrapped underlying iterator.
354  void add_pointer_offset(LongIndex pointer_offset) {
355  iterator_.add_pointer_offset(pointer_offset);
356  }
357 
/// Returns the access pointer produced by the wrapped underlying iterator.
/// AccessType is an alias of UnderlyingIterator::AccessType in this class.
360  AccessType *get() const {
361  return reinterpret_cast<AccessType *>(iterator_.get());
362  }
363 
/// Adds a tile offset given as a matrix coordinate. For this row-major adapter
/// the components are swapped: the column index is passed as the first
/// component and the row index as the second component of the coordinate
/// handed to the underlying iterator.
365  CUTLASS_DEVICE
366  void add_tile_offset(TensorCoord const &coord) {
367  iterator_.add_tile_offset({coord.column(), coord.row()});
368  }
369 
373  ++iterator_;
374  return *this;
375  }
376 
380  RegularTileAccessIterator prev(*this);
381  ++iterator_;
382 
383  return prev;
384  }
385 };
386 
388 
389 } // namespace threadblock
390 } // namespace transform
391 } // namespace cutlass
392 
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:62
Definition: aligned_buffer.h:35
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord)
Adds a tile offset.
Definition: regular_tile_access_iterator_pitch_linear.h:187
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
Defines a structure containing strides, bounds, and a pointer to tensor data.
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord)
Adds a tile offset.
Definition: regular_tile_access_iterator_pitch_linear.h:366
CUTLASS_HOST_DEVICE RegularTileAccessIterator operator++(int)
Advances to the next tile in memory.
Definition: regular_tile_access_iterator_pitch_linear.h:178
Array< Element, ThreadMap::kElementsPerAccess > AccessType
Element type per access.
Definition: regular_tile_access_iterator_pitch_linear.h:84
CUTLASS_HOST_DEVICE Element * data() const
Returns the pointer to referenced data.
Definition: tensor_ref.h:254
typename Layout::TensorCoord TensorCoord
Definition: regular_tile_access_iterator_pitch_linear.h:322
CUTLASS_HOST_DEVICE void set_iteration_index(int index)
Overrides the internal iteration index.
Definition: regular_tile_access_iterator_pitch_linear.h:253
Mapping function for pitch-linear memory.
Definition: pitch_linear.h:163
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:154
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord)
Adds a tile offset.
Definition: regular_tile_access_iterator_pitch_linear.h:269
typename UnderlyingIterator::AccessType AccessType
Definition: regular_tile_access_iterator_pitch_linear.h:333
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: regular_tile_access_iterator_pitch_linear.h:354
Mapping function for column-major matrices.
Definition: layout/matrix.h:142
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:59
CUTLASS_HOST_DEVICE half_t & operator++(half_t &lhs)
Definition: half.h:694
int64_t LongIndex
Long index type used for offsets.
Definition: pitch_linear.h:175
CUTLASS_HOST_DEVICE RegularTileAccessIterator operator++(int)
Advances to the next tile in memory.
Definition: regular_tile_access_iterator_pitch_linear.h:379
CUTLASS_HOST_DEVICE Stride stride() const
Returns the layout object's stride vector.
Definition: tensor_ref.h:277
CUTLASS_HOST_DEVICE RegularTileAccessIterator & operator++()
Advances to the next tile in memory.
Definition: regular_tile_access_iterator_pitch_linear.h:154
Defines a Shape template for matrix tiles.
CUTLASS_HOST_DEVICE RegularTileAccessIterator(TensorRef ref, int thread_id)
Construct a TileIterator with zero threadblock offset.
Definition: regular_tile_access_iterator_pitch_linear.h:343
CUTLASS_HOST_DEVICE RegularTileAccessIterator & operator++()
Advances to the next tile in memory.
Definition: regular_tile_access_iterator_pitch_linear.h:372
typename UnderlyingIterator::AccessType AccessType
Definition: regular_tile_access_iterator_pitch_linear.h:236
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
CUTLASS_HOST_DEVICE LongIndex offset(TensorCoord const &coord) const
Computes the offset of an index from the origin of the tensor.
Definition: tensor_ref.h:301
#define static_assert(__e, __m)
Definition: platform.h:153
CUTLASS_HOST_DEVICE RegularTileAccessIterator(TensorRef ref, int thread_id)
Construct a TileIterator with zero threadblock offset.
Definition: regular_tile_access_iterator_pitch_linear.h:109
int32_t Index
Index type used for coordinates.
Definition: pitch_linear.h:172
CUTLASS_HOST_DEVICE void set_iteration_index(int index)
Overrides the internal iteration index.
Definition: regular_tile_access_iterator_pitch_linear.h:125
Templates implementing the address computation of storing of tiles from pitch-linear rank=2 tensors...
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
Defines a canonical coordinate for rank=2 matrices offering named indices.
CUTLASS_HOST_DEVICE RegularTileAccessIterator & operator++()
Advances to the next tile in memory.
Definition: regular_tile_access_iterator_pitch_linear.h:275
Definition: regular_tile_access_iterator.h:48
CUTLASS_HOST_DEVICE RegularTileAccessIterator(TensorRef ref, int thread_id)
Construct a TileIterator with zero threadblock offset.
Definition: regular_tile_access_iterator_pitch_linear.h:246
Defines layout functions used by TensorRef and derived classes.
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:151
CUTLASS_HOST_DEVICE RegularTileAccessIterator operator++(int)
Advances to the next tile in memory.
Definition: regular_tile_access_iterator_pitch_linear.h:282
typename Layout::TensorCoord TensorCoord
Definition: regular_tile_access_iterator_pitch_linear.h:225
CUTLASS_HOST_DEVICE void set_iteration_index(int index)
Overrides the internal iteration index.
Definition: regular_tile_access_iterator_pitch_linear.h:350
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: regular_tile_access_iterator_pitch_linear.h:132
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: regular_tile_access_iterator_pitch_linear.h:257
Basic include for CUTLASS.
Definition: matrix_coord.h:39