CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
tile_iterator_volta_tensor_op.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
  \brief Template for reading and writing tiles of accumulators to shared memory.
*/

#pragma once

#include "cutlass/array.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/layout/pitch_linear.h"

#include "cutlass/epilogue/warp/volta_tensor_op_policy.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace warp {

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Template for reading and writing tiles of accumulators to shared memory
template <
  typename WarpShape,             ///< shape of warp-level GEMM (concept: GemmShape)
  typename InterleavedTileShape,  ///< shape of the interleaved instruction-level tile (concept: GemmShape)
  typename ElementC,              ///< accumulator data type
  typename Layout                 ///< target shared memory layout
>
class TileIteratorVoltaTensorOp;

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Partial specialization for half_t accumulators written to a row-major shared memory tile
template <
  typename WarpShape_             ///< shape of warp-level GEMM (concept: GemmShape)
>
class TileIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, half_t, layout::RowMajor> {
public:

  using WarpShape = WarpShape_;
  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
  using Element = half_t;
  using Layout = layout::RowMajor;

  using TensorRef = TensorRef<Element, Layout>;   ///< Tensor Reference object
  using TensorCoord = MatrixCoord;                ///< Logical coordinate in referenced tensor
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;

  /// Policy describing the arrangement of accumulator fragments
  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, Element, Layout>;

  /// Shape of the tile in memory
  using Shape = MatrixShape<
    Policy::kRowsPerIteration,
    WarpShape::kN
  >;

  /// Array type for aligned memory accesses
  using AccessType = typename Policy::AccessType;

  /// This is the fragment size produced by one access of the iterator.
  using Fragment = typename Policy::Fragment;

  /// This is the complete warp-level accumulator tile.
  using AccumulatorTile = typename Policy::AccumulatorTile;

  /// Number of times this iterator may be incremented
  static int const kIterations = Policy::kIterations;

  /// Number of elements written per vectorized access
  static int const kElementsPerAccess = Policy::kElementsPerAccess;

  // Internal constants
  struct Detail {
    static int const kLanesInQuad = 4;
    static int const kRowsPerQuad = 4;
    static int const kColumnsPerQuad = 8;
    static int const kAccessesPerQuad = kColumnsPerQuad / Policy::kElementsPerAccess;
    static int const kAccessQuadDelta = 16;
  };

  /// Padding quantity
  using Padding = MatrixShape<
    0,
    Policy::kElementsPerAccess>;

private:

  //
  // Data members
  //

  /// Internal pointer to memory
  AccessType *pointer_;

  /// Internal layout object
  Layout layout_;

public:

  //
  // Methods
  //

  /// Default constructor
  CUTLASS_HOST_DEVICE
  TileIteratorVoltaTensorOp(): pointer_(nullptr) { }

  /// Constructor from TensorRef
  CUTLASS_DEVICE
  TileIteratorVoltaTensorOp(
    TensorRef const &ref,
    unsigned lane_id
  ):
    pointer_(reinterpret_cast<AccessType *>(ref.data())),
    layout_(ref.stride()[0] / Policy::kElementsPerAccess) {

    int quad_id = lane_id / Detail::kLanesInQuad;
    int lane_in_quad = (lane_id % Detail::kLanesInQuad);

    // Each quad of four lanes owns a 4-row by 8-column sub-tile; each lane owns one row of it.
    int quad_row_idx = ((quad_id & 4) >> 1) + (quad_id & 1);
    int quad_col_idx = ((quad_id & 2) >> 1);

    int row = quad_row_idx * Detail::kRowsPerQuad + lane_in_quad;
    int column = quad_col_idx * Detail::kColumnsPerQuad;

    pointer_ += layout_({row, column / kElementsPerAccess});
  }

  /// Adds a pointer offset
  CUTLASS_HOST_DEVICE
  TileIteratorVoltaTensorOp & add_pointer_offset(Index pointer_offset) {
    pointer_ += pointer_offset / Policy::kElementsPerAccess;
    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_HOST_DEVICE
  TileIteratorVoltaTensorOp & add_tile_offset(TensorCoord const &tile_offset) {

    pointer_ += layout_({
      tile_offset.row() * Shape::kRow,
      tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess});

    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_HOST_DEVICE
  TileIteratorVoltaTensorOp & operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  /// Store
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int tile_idx = 0; tile_idx < Policy::TileIterations::kColumn; ++tile_idx) {

      CUTLASS_PRAGMA_UNROLL
      for (int access_idx = 0; access_idx < Policy::kAccessesPerInterleavedTile; ++access_idx) {

        int access_quad = access_idx / 2;
        int access = access_idx % 2;

        int ptr_offset = tile_idx * InterleavedTileShape::kN / Policy::kElementsPerAccess +
          access_quad * Detail::kAccessQuadDelta / Policy::kElementsPerAccess + access;

        int frag_idx = tile_idx * Policy::kAccessesPerInterleavedTile + access_idx;

        AccessType access_vector = frag_ptr[frag_idx];

        pointer_[ptr_offset] = access_vector;
      }
    }
  }

  /// Store
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }

  /// Load
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int tile_idx = 0; tile_idx < Policy::TileIterations::kColumn; ++tile_idx) {

      CUTLASS_PRAGMA_UNROLL
      for (int access_idx = 0; access_idx < Policy::kAccessesPerInterleavedTile; ++access_idx) {

        int access_quad = access_idx / 2;
        int access = access_idx % 2;

        // Mirrors the addressing used by store_with_pointer_offset()
        int ptr_offset = tile_idx * InterleavedTileShape::kN / Policy::kElementsPerAccess +
          access_quad * Detail::kAccessQuadDelta / Policy::kElementsPerAccess + access;

        int frag_idx = tile_idx * Policy::kAccessesPerInterleavedTile + access_idx;

        frag_ptr[frag_idx] = pointer_[ptr_offset];
      }
    }
  }

  /// Load
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for float accumulators written to a row-major shared memory tile
template <
  typename WarpShape_             ///< shape of warp-level GEMM (concept: GemmShape)
>
class TileIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, float, layout::RowMajor> {
public:

  using WarpShape = WarpShape_;
  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
  using Element = float;
  using Layout = layout::RowMajor;

  using TensorRef = TensorRef<Element, Layout>;   ///< Tensor Reference object
  using TensorCoord = MatrixCoord;                ///< Logical coordinate in referenced tensor
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;

  /// Policy describing the arrangement of accumulator fragments
  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, Element, Layout>;

  /// Shape of the tile in memory
  using Shape = MatrixShape<
    Policy::kRowsPerIteration,
    WarpShape::kN
  >;

  /// Array type for aligned memory accesses
  using AccessType = typename Policy::AccessType;

  /// This is the fragment size produced by one access of the iterator.
  using Fragment = typename Policy::Fragment;

  /// This is the complete warp-level accumulator tile.
  using AccumulatorTile = typename Policy::AccumulatorTile;

  /// Number of times this iterator may be incremented
  static int const kIterations = Policy::kIterations;

  /// Number of elements written per vectorized access
  static int const kElementsPerAccess = Policy::kElementsPerAccess;

  // Internal constants
  struct Detail {
    static int const kLanesInQuad = 4;
    static int const kRowsPerQuad = 4;
    static int const kColumnsPerQuad = 8;
    static int const kAccessesPerQuad = kColumnsPerQuad / Policy::kElementsPerAccess;
    static int const kAccessQuadDelta = 16;
  };

  /// Padding quantity
  using Padding = MatrixShape<
    0,
    Policy::kElementsPerAccess>;

private:

  //
  // Data members
  //

  /// Internal pointer to memory
  AccessType *pointer_;

  /// Internal layout object
  Layout layout_;

public:

  //
  // Methods
  //

  /// Default constructor
  CUTLASS_HOST_DEVICE
  TileIteratorVoltaTensorOp(): pointer_(nullptr) { }

  /// Constructor from TensorRef
  CUTLASS_DEVICE
  TileIteratorVoltaTensorOp(
    TensorRef const &ref,
    unsigned lane_id
  ):
    pointer_(reinterpret_cast<AccessType *>(ref.data())),
    layout_(ref.stride()[0] / Policy::kElementsPerAccess) {

    int quad_id = lane_id / Detail::kLanesInQuad;
    int lane_in_quad = (lane_id % Detail::kLanesInQuad);

    int const kQuadRowDelta = 4;
    int const kQuadColumnDelta = 2 * Policy::MmaIterations::kColumn;

    // Position the quad within the tile, then place the lane in a 2x2 arrangement inside the quad.
    int quad_row_offset = ((quad_id & 4) / 2 + (quad_id & 1)) * kQuadRowDelta;
    int quad_column_offset = (quad_id & 2) / 2 * kQuadColumnDelta;

    int thread_row_offset = (lane_in_quad & 1);
    int thread_column_offset = (lane_in_quad & 2) / 2;

    int row = quad_row_offset + thread_row_offset;
    int column = quad_column_offset + thread_column_offset;

    pointer_ += layout_({row, column});
  }

  /// Adds a pointer offset
  CUTLASS_HOST_DEVICE
  TileIteratorVoltaTensorOp & add_pointer_offset(Index pointer_offset) {
    pointer_ += pointer_offset / Policy::kElementsPerAccess;
    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_HOST_DEVICE
  TileIteratorVoltaTensorOp & add_tile_offset(TensorCoord const &tile_offset) {

    pointer_ += layout_({
      tile_offset.row() * Shape::kRow,
      tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess});

    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_HOST_DEVICE
  TileIteratorVoltaTensorOp & operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  /// Store
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    int const kAccessesPerRow = Policy::TileIterations::kColumn * Policy::MmaIterations::kColumn * 2;

    CUTLASS_PRAGMA_UNROLL
    for (int row_idx = 0; row_idx < Policy::kRowsPerMmaTile; ++row_idx) {

      CUTLASS_PRAGMA_UNROLL
      for (int access_idx = 0; access_idx < kAccessesPerRow; ++access_idx) {

        int frag_idx = row_idx * kAccessesPerRow + access_idx;

        int ptr_column_offset = (access_idx & 1) * 2 +
          (access_idx & 2) * Policy::MmaIterations::kColumn * 2 +
          (access_idx & 4) * Policy::MmaIterations::kColumn * 2;

        int ptr_row_offset = row_idx * 2;

        int ptr_offset = layout_({ptr_row_offset, ptr_column_offset});

        pointer_[ptr_offset] = frag_ptr[frag_idx];
      }
    }
  }

  /// Store
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }

  /// Load
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    assert(0); // TODO: loading the float accumulator tile is not yet implemented
  }

  /// Load
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace warp
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
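For orientation, the sketch below shows roughly how warp-level code might drive the half_t specialization defined above: each lane constructs the iterator over a warp's shared-memory tile and stores one fragment of accumulators. This is a minimal sketch, not the canonical wiring (which lives in the threadblock-scoped epilogue); the 64x64x4 WarpShape, the function name store_warp_fragment, and its parameters are illustrative assumptions.

#include "cutlass/gemm/gemm.h"
#include "cutlass/half.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h"

// Illustrative configuration: a 64x64x4 warp tile over the 32x32x4 interleaved Volta tile.
using WarpShape        = cutlass::gemm::GemmShape<64, 64, 4>;
using InterleavedShape = cutlass::gemm::GemmShape<32, 32, 4>;

using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
    WarpShape, InterleavedShape, cutlass::half_t, cutlass::layout::RowMajor>;

// Hypothetical helper: write one iteration's worth of accumulators to shared memory.
__device__ void store_warp_fragment(
    WarpTileIterator::TensorRef shared_tile,        // shared-memory tile owned by this warp
    WarpTileIterator::Fragment const &fragment,     // one iteration's accumulator fragment
    unsigned lane_id) {                             // lane index within the warp (0..31)

  WarpTileIterator iterator(shared_tile, lane_id);  // each lane computes its own starting offset
  iterator.store(fragment);                         // write a row-oriented slice of the tile
}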