CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
regular_tile_iterator_pitch_linear.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Templates implementing storing of tiles from pitch-linear rank=2 tensors.
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/layout/pitch_linear.h"

#include "regular_tile_iterator.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace transform {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Regular tile iterator specialized for pitch-linear data
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Alignment> {
public:

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::PitchLinear;
  static int const kAdvanceRank = AdvanceRank;
  using ThreadMap = ThreadMap_;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;

  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1,
    "Advance rank may only be along the contiguous or strided dimensions.");

private:

  //
  // Types
  //

  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess, kAlignment>;

  //
  // Data members
  //

  /// Pointer to memory (in bytes)
  uint8_t *pointer_;

  /// Stride in units of Element
  Index stride_;

  /// Byte increment applied between strided iterations
  Index increment_strided_;

  /// Byte increment applied when advancing to the next tile
  Index increment_advance_;

public:

  CUTLASS_DEVICE
  RegularTileIterator(): pointer_(nullptr), increment_strided_(0), increment_advance_(0) { }

  CUTLASS_DEVICE
  RegularTileIterator(
    TensorRef const &ref,
    int thread_idx
  ):
    pointer_(reinterpret_cast<uint8_t *>(ref.data()) + (ref.offset(ThreadMap::initial_offset(thread_idx)) * sizeof_bits<Element>::value / 8)) {

    // stride_ is kept in elements; the increments below are byte offsets
    stride_ = ref.stride()[0];
    increment_strided_ = (ref.stride()[0] * sizeof_bits<Element>::value) * ThreadMap::Delta::kStrided / 8;

    increment_advance_ =
      (kAdvanceRank == 0 ?
        Shape::kContiguous * sizeof_bits<Element>::value / 8 :
        Shape::kStrided * (ref.stride()[0] * sizeof_bits<Element>::value / 8));
  }

  /// Loads a fragment
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
    uint8_t const *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_pointer);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int idx = c + s * ThreadMap::Iterations::kContiguous;
        frag_ptr[idx] = access_ptr[c * ThreadMap::Delta::kContiguous];
      }

      if (s + 1 < ThreadMap::Iterations::kStrided) {
        byte_pointer += increment_strided_;
      }
    }
  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) {
    load_with_pointer_offset(
      frag,
      tile_offset.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
      tile_offset.strided() * Shape::kStrided * stride_
    );
  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
    uint8_t *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_pointer);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int idx = c + s * ThreadMap::Iterations::kContiguous;
        access_ptr[c * ThreadMap::Delta::kContiguous] = frag_ptr[idx];
      }

      if (s + 1 < ThreadMap::Iterations::kStrided) {
        byte_pointer += increment_strided_;
      }
    }
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset) {
    store_with_pointer_offset(
      frag,
      tile_offset.contiguous() * Shape::kContiguous + tile_offset.strided() * Shape::kStrided * stride_
    );
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }

  /// Advances the pointer to the next tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    pointer_ += increment_advance_;
    return *this;
  }

  /// Moves the pointer back to the previous tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator--() {
    pointer_ -= increment_advance_;
    return *this;
  }

  /// Adds a pointer offset
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    pointer_ += pointer_offset;
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    int offset = sizeof_bits<Element>::value *
      (coord.contiguous() * Shape::kContiguous + coord.strided() * Shape::kStrided * stride_) / 8;
    add_pointer_offset(offset);
  }

};

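// Worked example (illustrative values, not taken from this header): for Element = half_t
// (16 bits), a tile stride of 64 elements, and ThreadMap::Delta::kStrided = 8, the iterator
// above computes increment_strided_ = (64 * 16) * 8 / 8 = 1024 bytes, so each strided step of
// the load/store loops advances the byte pointer past eight rows of the pitch-linear tile.
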
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Regular tile iterator specialized for row-major data
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, Alignment> {
public:

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajor;
  static int const kAdvanceRank = AdvanceRank;
  using ThreadMap = ThreadMap_;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;

  /// Underlying pitch-linear iterator operating on the transposed (contiguous, strided) view
  using Underlying = RegularTileIterator<
    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
    Element,
    layout::PitchLinear,
    (kAdvanceRank == 0 ? 1 : 0),
    ThreadMap,
    kAlignment
  >;

  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1,
    "Advance rank may only be along the row or column dimensions.");

private:

  Underlying iterator_;

public:

  CUTLASS_DEVICE
  RegularTileIterator() { }

  CUTLASS_DEVICE
  RegularTileIterator(
    TensorRef const &ref,
    int thread_idx
  ):
    iterator_({ref.data(), ref.stride()}, thread_idx) {

  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) {
    iterator_.load(frag, {tile_offset.column(), tile_offset.row()});
  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    iterator_.load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset) {
    iterator_.store(frag, {tile_offset.column(), tile_offset.row()});
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    iterator_.store_with_pointer_offset(frag, 0);
  }

  /// Advances the pointer to the next tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Moves the pointer back to the previous tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator--() {
    --iterator_;
    return *this;
  }

  /// Adds a pointer offset
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

};

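// Note: this row-major specialization, like the column-major one below, only adapts
// coordinates; all pointer arithmetic is delegated to the pitch-linear iterator above.
// For example, a row-major MatrixCoord {row, column} is forwarded as a PitchLinearCoord
// {contiguous = column, strided = row}, matching the transposed PitchLinearShape used
// for Underlying.
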
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Regular tile iterator specialized for column-major data
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, Alignment> {
public:

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajor;
  static int const kAdvanceRank = AdvanceRank;
  using ThreadMap = ThreadMap_;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;

  /// Underlying pitch-linear iterator operating on the (contiguous, strided) view
  using Underlying = RegularTileIterator<
    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
    Element,
    layout::PitchLinear,
    (kAdvanceRank == 0 ? 0 : 1),
    ThreadMap
  >;

  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1,
    "Advance rank may only be along the row or column dimensions.");

private:

  Underlying iterator_;

public:

  CUTLASS_DEVICE
  RegularTileIterator() { }

  CUTLASS_DEVICE
  RegularTileIterator(
    TensorRef const &ref,
    int thread_idx
  ):
    iterator_({ref.data(), ref.stride()}, thread_idx) {

  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) {
    iterator_.load(frag, {tile_offset.row(), tile_offset.column()});
  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    iterator_.load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const &tile_offset) {
    iterator_.store(frag, {tile_offset.row(), tile_offset.column()});
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    iterator_.store_with_pointer_offset(frag, 0);
  }

  /// Advances the pointer to the next tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Moves the pointer back to the previous tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator--() {
    --iterator_;
    return *this;
  }

  /// Adds a pointer offset
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace transform
} // namespace cutlass
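The sketch below shows how the pitch-linear specialization above might be instantiated over a shared-memory tile. It is a minimal, illustrative example rather than code from this header: the thread map cutlass::transform::PitchLinearStripminedThreadMap is assumed to be available from cutlass/transform/pitch_linear_thread_map.h, and the tile shape, element type, thread count, and alignment are arbitrary choices.

#include "cutlass/cutlass.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/transform/pitch_linear_thread_map.h"
#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"

// Illustrative configuration: a 64 (contiguous) x 8 (strided) tile of float,
// cooperatively accessed by 32 threads, one element per access.
using TileShape = cutlass::layout::PitchLinearShape<64, 8>;
using Element   = float;
using ThreadMap = cutlass::transform::PitchLinearStripminedThreadMap<TileShape, 32>;

using TileIterator = cutlass::transform::threadblock::RegularTileIterator<
    TileShape, Element, cutlass::layout::PitchLinear,
    /*AdvanceRank=*/1, ThreadMap, /*Alignment=*/4>;

// Copies one tile: each thread loads its fragment, the iterator advances one tile
// along the strided dimension, and the fragment is stored at the new position.
// Assumes 'smem' holds at least two consecutive 64x8 tiles.
__device__ void copy_tile_in_smem(Element *smem, int thread_idx) {
  cutlass::TensorRef<Element, cutlass::layout::PitchLinear> ref(
      smem, cutlass::layout::PitchLinear(TileShape::kContiguous));

  TileIterator iter(ref, thread_idx);

  typename TileIterator::Fragment frag;
  iter.load(frag);
  ++iter;
  iter.store(frag);
}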