CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
regular_tile_iterator_pitch_linear_2dthreadtile.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/layout/pitch_linear.h"

#include "regular_tile_iterator.h"
45 
46 namespace cutlass {
47 namespace transform {
48 namespace threadblock {
49 
/// Regular tile iterator dedicated to 2D thread-tiled thread maps.
///
/// Primary template; the partial specializations below implement the iterator
/// for pitch-linear and 4-element-interleaved row/column-major layouts.
///
/// NOTE(review): the forward-declaration line was dropped by the extraction
/// and is restored here -- the specializations below require it.
template <
  typename Shape,
  typename Element,
  typename Layout,
  int AdvanceRank,
  typename ThreadMap,
  int Alignment = sizeof_bits<Element>::value * ThreadMap::kElementsPerAccess / 8
>
class RegularTileIterator2dThreadTile;
61 
/// Regular tile iterator specialized for pitch-linear memory with a 2D
/// thread-tiled thread map.
///
/// The tensor is viewed as interleaved: the constructor's `interleave`
/// argument divides the strided extent, and all strided increments are
/// computed in units of interleaved rows.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator2dThreadTile<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Alignment> {
public:

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::PitchLinear;
  static int const kAdvanceRank = AdvanceRank;
  using ThreadMap = ThreadMap_;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  /// Fragment holds one access per (contiguous x strided) iteration.
  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;

  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1,
    "Advance rank may only be along the contiguous or strided dimensions.");

private:

  //
  // Types
  //

  /// One thread access worth of elements, aligned for vectorized transfers.
  /// NOTE(review): reconstructed -- the typedef line was lost in extraction;
  /// confirm element count against the upstream header.
  using AccessType = AlignedArray<Element, ThreadMap::ThreadAccessShape::kCount, kAlignment>;

  //
  // Data members
  //

  /// Pointer to memory (byte-addressed)
  uint8_t *pointer_;

  /// Stride quantity, in elements, of one interleaved row
  Index stride_;

  /// Byte increment between successive strided iterations
  Index increment_strided_;

  /// Byte increment applied by operator++ / operator-- (along kAdvanceRank)
  Index increment_advance_;

public:

  /// Default constructor produces a null iterator.
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile(): pointer_(nullptr), increment_strided_(0), increment_advance_(0) { }

  /// Constructs an iterator from a TensorRef, thread index, and interleave factor.
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile(
    TensorRef const &ref,
    int thread_idx,
    int interleave
  ){

    // Per-thread starting coordinate supplied by the thread map.
    TensorCoord t = ThreadMap::initial_offset(thread_idx);

    // Element offset: contiguous component scaled by interleave, strided
    // component scaled by the interleaved stride.
    long int offset = t[0] * interleave + t[1] * ref.stride()[0] / interleave;
    pointer_ = reinterpret_cast<uint8_t *>(ref.data() + offset);

    stride_ = ref.stride()[0] / interleave;
    increment_strided_ = (ref.stride()[0] * sizeof_bits<Element>::value / 8) * ThreadMap::Delta::kStrided / interleave;

    increment_advance_ =
      (kAdvanceRank == 0 ?
        Shape::kContiguous * sizeof_bits<Element>::value / 8 :
        Shape::kStrided * (ref.stride()[0] * sizeof_bits<Element>::value / 8) / interleave);
  }

  /// Loads a fragment from memory at the pointer plus an element offset.
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
    uint8_t const *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_pointer);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int idx = c + s * ThreadMap::Iterations::kContiguous;
        frag_ptr[idx] = access_ptr[c * ThreadMap::Delta::kContiguous / ThreadMap::ThreadAccessShape::kStrided];
      }

      // Avoid advancing past the final strided iteration.
      if (s + 1 < ThreadMap::Iterations::kStrided) {
        byte_pointer += increment_strided_;
      }
    }
  }

  /// Loads a fragment at a whole-tile offset.
  /// NOTE(review): the contiguous term here is divided by kElementsPerAccess
  /// while the analogous store() below is not -- asymmetry preserved from the
  /// original; confirm intended semantics upstream.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const & tile_offset) {
    load_with_pointer_offset(
      frag,
      tile_offset.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
      tile_offset.strided() * Shape::kStrided * stride_
    );
  }

  /// Loads a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory at the pointer plus an element offset.
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);
    uint8_t *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_pointer);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int idx = c + s * ThreadMap::Iterations::kContiguous;
        access_ptr[c * ThreadMap::Delta::kContiguous / ThreadMap::ThreadAccessShape::kStrided] = frag_ptr[idx];
      }

      // Avoid advancing past the final strided iteration.
      if (s + 1 < ThreadMap::Iterations::kStrided) {
        byte_pointer += increment_strided_;
      }
    }
  }

  /// Stores a fragment at a whole-tile offset.
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const & tile_offset) {
    store_with_pointer_offset(
      frag,
      tile_offset.contiguous() * Shape::kContiguous + tile_offset.strided() * Shape::kStrided * stride_
    );
  }

  /// Stores a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }

  /// Advances the pointer one tile along the advance dimension.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator++() {
    pointer_ += increment_advance_;
    return *this;
  }

  /// Retreats the pointer one tile along the advance dimension.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator--() {
    pointer_ -= increment_advance_;
    return *this;
  }

  /// Adds a pointer offset.
  /// NOTE(review): offset is added to a byte pointer without scaling by
  /// sizeof(Element) -- preserved as-is from the original; callers appear to
  /// pass pre-scaled byte offsets (see add_tile_offset).
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    pointer_ += pointer_offset;
  }

  /// Adds a whole-tile offset, converting coordinates to a byte offset.
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    int offset = sizeof_bits<Element>::value *
      (coord.contiguous() * Shape::kContiguous + coord.strided() * Shape::kStrided * stride_) / 8;
    add_pointer_offset(offset);
  }

};
250 
252 
/// Regular tile iterator specialized for interleaved-32 row-major layout,
/// implemented by adapting coordinates onto the pitch-linear specialization.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator2dThreadTile<Shape_, Element_, layout::RowMajorInterleaved<4>, AdvanceRank, ThreadMap_, Alignment> {
public:

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorInterleaved<4>;
  static int const kAdvanceRank = AdvanceRank;
  using ThreadMap = ThreadMap_;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;

  /// Underlying pitch-linear iterator: row-major maps (column, row) onto
  /// (contiguous, strided), so the advance rank is flipped.
  using Underlying = RegularTileIterator2dThreadTile<
    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
    Element,
    layout::PitchLinear,
    (kAdvanceRank == 0 ? 1 : 0),
    ThreadMap,
    kAlignment
  >;

  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1,
    "Advance rank may only be along the row or column dimensions.");

private:

  /// Underlying pitch-linear tile iterator
  Underlying iterator_;

public:

  /// Default constructor
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile() { }

  /// Constructs an iterator from a TensorRef; interleave factor fixed at 4.
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile(
    TensorRef const &ref,
    int thread_idx
  ):
    iterator_({ref.data(), ref.stride()}, thread_idx, 4) {

  }

  /// Loads a fragment at an element offset.
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment at a whole-tile offset.
  /// Fix: the original passed a braced (column, row) pair to
  /// load_with_pointer_offset, whose parameter is a scalar Index -- ill-formed
  /// on instantiation. Forward to the coordinate overload instead.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const & tile_offset) {
    iterator_.load(frag, {tile_offset.column(), tile_offset.row()});
  }

  /// Loads a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    iterator_.load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment at an element offset.
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment at a whole-tile offset.
  /// Fix: forward to the coordinate overload (see load above).
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const & tile_offset) {
    iterator_.store(frag, {tile_offset.column(), tile_offset.row()});
  }

  /// Stores a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    iterator_.store_with_pointer_offset(frag, 0);
  }

  /// Advances the pointer one tile.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator++() {
    ++iterator_;
    return *this;
  }

  /// Retreats the pointer one tile.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator--() {
    --iterator_;
    return *this;
  }

  /// Adds a pointer offset in units of Element.
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a whole-tile offset, swapping to pitch-linear coordinate order.
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

};
372 
374 
/// Regular tile iterator specialized for interleaved-32 column-major layout,
/// implemented by adapting coordinates onto the pitch-linear specialization.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator2dThreadTile<Shape_, Element_, layout::ColumnMajorInterleaved<4>, AdvanceRank, ThreadMap_, Alignment> {
public:

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorInterleaved<4>;
  static int const kAdvanceRank = AdvanceRank;
  using ThreadMap = ThreadMap_;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;

  /// NOTE(review): reconstructed from a partially extracted typedef spanning
  /// two lines in the original -- confirm the thread-map alias against the
  /// upstream header before relying on it.
  using PitchLinearThreadMap = PitchLinearStripminedThreadMap<
    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
    ThreadMap::kThreads, ThreadMap::ThreadAccessShape::kCount >;

  /// Underlying pitch-linear iterator: column-major maps (row, column) onto
  /// (contiguous, strided), so the advance rank is preserved.
  /// Fix: pass kAlignment explicitly for consistency with the row-major
  /// specialization; the original fell back to the primary template's
  /// default alignment.
  using Underlying = RegularTileIterator2dThreadTile<
    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
    Element,
    layout::PitchLinear,
    (kAdvanceRank == 0 ? 0 : 1),
    ThreadMap,
    kAlignment
  >;

  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1,
    "Advance rank may only be along the row or column dimensions.");

private:

  /// Underlying pitch-linear tile iterator
  Underlying iterator_;

public:

  /// Default constructor
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile() { }

  /// Constructs an iterator from a TensorRef; interleave factor fixed at 4.
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile(
    TensorRef const &ref,
    int thread_idx
  ):
    iterator_({ref.data(), ref.stride()}, thread_idx, 4) {

  }

  /// Loads a fragment at an element offset.
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment at a whole-tile offset.
  /// Fix: the original passed a braced (row, column) pair to
  /// load_with_pointer_offset, whose parameter is a scalar Index -- ill-formed
  /// on instantiation. Forward to the coordinate overload instead.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const & tile_offset) {
    iterator_.load(frag, {tile_offset.row(), tile_offset.column()});
  }

  /// Loads a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    iterator_.load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment at an element offset.
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment at a whole-tile offset.
  /// Fix: forward to the coordinate overload (see load above).
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const & tile_offset) {
    iterator_.store(frag, {tile_offset.row(), tile_offset.column()});
  }

  /// Stores a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    iterator_.store_with_pointer_offset(frag, 0);
  }

  /// Advances the pointer one tile.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator++() {
    ++iterator_;
    return *this;
  }

  /// Retreats the pointer one tile.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator--() {
    --iterator_;
    return *this;
  }

  /// Adds a pointer offset in units of Element.
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a whole-tile offset, swapping to pitch-linear coordinate order.
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

};
496 
498 
499 } // namespace threadblock
500 } // namespace transform
501 } // namespace cutlass
502 
CUTLASS_HOST_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:441
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:355
CUTLASS_HOST_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:459
Definition: aligned_buffer.h:35
CUTLASS_DEVICE RegularTileIterator2dThreadTile(TensorRef const &ref, int thread_idx, int interleave)
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:121
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator--()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:230
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
static int const value
Definition: numeric_types.h:43
Defines a structure containing strides, bounds, and a pointer to tensor data.
CUTLASS_HOST_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:183
Array< Element, ThreadMap::Iterations::kCount *ThreadMap::ThreadAccessShape::kCount > Fragment
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:277
CUTLASS_HOST_DEVICE Element * data() const
Returns the pointer to referenced data.
Definition: tensor_ref.h:254
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:249
Mapping function for pitch-linear memory.
Definition: pitch_linear.h:163
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord)
Adds a tile offset.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:367
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:352
CUTLASS_HOST_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:329
CUTLASS_HOST_DEVICE void load(Fragment &frag)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:177
Aligned array type.
Definition: array.h:511
CUTLASS_HOST_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:435
typename Layout::TensorCoord TensorCoord
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:84
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:246
CUTLASS_HOST_DEVICE void store(Fragment const &frag)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:341
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator--()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:354
CUTLASS_HOST_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:311
CUTLASS_HOST_DEVICE void load(Fragment &frag)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:447
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
int64_t LongIndex
Long index type used for offsets.
Definition: pitch_linear.h:175
CUTLASS_HOST_DEVICE Stride stride() const
Returns the layout object&#39;s stride vector.
Definition: tensor_ref.h:277
Defines the size of an element in bits.
Definition: numeric_types.h:42
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord)
Adds a tile offset.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:243
CUTLASS_HOST_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:453
#define nullptr
nullptr
Definition: platform.h:144
CUTLASS_HOST_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:317
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:237
CUTLASS_HOST_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:335
CUTLASS_DEVICE RegularTileIterator2dThreadTile(TensorRef const &ref, int thread_idx)
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:301
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:485
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator++()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:347
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
#define static_assert(__e, __m)
Definition: platform.h:153
int32_t Index
Index type used for coordinates.
Definition: pitch_linear.h:172
CUTLASS_DEVICE RegularTileIterator2dThreadTile(TensorRef const &ref, int thread_idx)
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:425
CUTLASS_HOST_DEVICE void store(Fragment const &frag)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:465
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator++()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:223
Array< Element, ThreadMap::Iterations::kCount *ThreadMap::ThreadAccessShape::kCount > Fragment
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:86
CUTLASS_HOST_DEVICE void store(Fragment const &frag)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:217
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:142
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:361
Templates implementing storing of tiles from pitch-linear rank=2 tensors.
CUTLASS_HOST_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:208
Defines layout functions used by TensorRef and derived classes.
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator++()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:471
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
CUTLASS_HOST_DEVICE void load(Fragment &frag)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:323
CUTLASS_HOST_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:167
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:59
Basic include for CUTLASS.
Definition: matrix_coord.h:39
Definition: pitch_linear_thread_map.h:59
Array< Element, ThreadMap::Iterations::kCount *ThreadMap::ThreadAccessShape::kCount > Fragment
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:399
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord)
Adds a tile offset.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:491
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator--()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:478