CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
tile_iterator_simt.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/array.h"
32 #include "cutlass/layout/matrix.h"
34 
36 
37 #define CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES 1
38 
40 
41 namespace cutlass {
42 namespace epilogue {
43 namespace warp {
44 
46 
// Primary template for the warp-scoped SIMT epilogue tile iterator; only the
// row-major partial specialization below is defined in this listing.
// NOTE(review): the declaration line itself (listing line 55,
// `class TileIteratorSimt;`) is missing from this extraction — confirm
// against the original header.
48 template <
49  typename WarpShape,
50  typename Operator,
51  typename Element,
52  typename Layout,
53  typename MmaSimtPolicy
54 >
56 
58 
60 template <
61  typename WarpShape_,
62  typename Operator_,
63  typename Element_,
64  typename MmaSimtPolicy_
65 >
// Row-major specialization: reads and writes tiles of SIMT accumulators to
// shared memory (per the file's own description), one row-slice per iteration.
// NOTE(review): this Doxygen listing omits several original lines — the
// TensorRef/TensorCoord/Policy/AccessType alias declarations and the
// CUTLASS_HOST_DEVICE-prefixed signature lines of the constructor,
// add_pointer_offset, add_tile_offset, etc. Do not edit code here without the
// complete header.
66 class TileIteratorSimt<WarpShape_, Operator_, Element_, layout::RowMajor, MmaSimtPolicy_> {
67 public:
68 
69  using WarpShape = WarpShape_;
70  using Operator = Operator_;
71  using Element = Element_;
72  using Layout = layout::RowMajor;
73 
// Index/LongIndex come from the (omitted) TensorRef alias.
76  using Index = typename TensorRef::Index;
77  using LongIndex = typename TensorRef::LongIndex;
78 
80 
// Logical tile produced per iteration: Policy::kRowsPerIteration rows spanning
// the full warp-tile width WarpShape::kN.
82  using Shape = MatrixShape<
83  Policy::kRowsPerIteration,
84  WarpShape::kN
85  >;
86 
// Fragment written/read by one iteration of the iterator.
88  using Fragment = Array<
89  typename Operator::ElementC,
90  Policy::kElementsPerIteration>;
91 
// The complete warp-level accumulator tile held across all iterations.
93  using AccumulatorTile = Array<
94  typename Operator::ElementC,
95  Policy::kAccumulatorElementCount>;
96 
// Number of store/load iterations needed to cover the accumulator tile.
98  static int const kIterations = Policy::kIterations;
99 
// Extra columns of padding (4 vector accesses wide) appended to each row —
// presumably to avoid shared-memory bank conflicts; confirm against the
// epilogue shared-storage layout.
101  using Padding = MatrixShape<
102  0,
103  4 * Policy::kElementsPerAccess>;
104 
105 private:
106 
109 
110  //
111  // Data members
112  //
113 
// Pointer into the tile, in units of AccessType (vector) elements.
115  AccessType *pointer_;
116 
// Row-major mapping whose stride is measured in AccessType units
// (constructor divides the element stride by kElementsPerAccess).
118  Layout layout_;
119 
120 public:
121 
// Default constructor: null iterator; must be re-assigned before use.
124  TileIteratorSimt(): pointer_(nullptr) { }
125 
// Constructor from TensorRef (signature line omitted by the extraction):
// positions the pointer at this lane's starting offset.
129  TensorRef const &ref,
130  unsigned lane_id
131  ):
132  pointer_(reinterpret_cast<AccessType *>(ref.data())),
133  layout_(ref.stride()[0] / Policy::kElementsPerAccess) {
134 
// Map the flat lane id back to its (row, column) position in the warp's
// lane arrangement, then offset the pointer to that lane's slot.
135  auto lane_layout = Policy::MmaSimtPolicy::get_lane_layout();
136  MatrixCoord lane_offset = lane_layout.inverse(lane_id);
137 
138  pointer_ += layout_(lane_offset);
139  }
140 
// add_pointer_offset (signature line omitted): pointer_offset is given in
// scalar elements and converted to AccessType units before advancing.
144  pointer_ += pointer_offset / Policy::kElementsPerAccess;
145  return *this;
146  }
147 
// add_tile_offset (signature line omitted): advances in units of whole tiles.
// The column term is divided by kElementsPerAccess because layout_'s stride
// is already expressed in AccessType units.
151 
152  pointer_ += layout_({
153  tile_offset.row() * Shape::kRow,
154  (tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess)
155  });
156 
157  return *this;
158  }
159 
// Convenience operator: identical to add_tile_offset.
162  TileIteratorSimt & operator+=(TensorCoord const &tile_offset) {
163 
164  add_tile_offset(tile_offset);
165 
166  return *this;
167  }
168 
// Store one fragment. NOTE(review): the pointer_offset parameter is never
// used in either compile path below — the only in-file caller (store())
// passes 0, but any other caller would silently get offset 0. Confirm
// whether later CUTLASS versions add "+ pointer_offset / kElementsPerAccess"
// to the addressed index.
171  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
172 #if CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES
// De-vectorized path: reinterpret both the fragment and the destination as
// arrays of single elements and copy scalar-by-scalar. Destination accesses
// by different fragments are strided by WarpShape::kColumn vector slots.
173  // de-vectorized stores
174  using ScalarAccessType = AlignedArray<Element, 1>;
175  ScalarAccessType const *scalarFragPtr = reinterpret_cast<ScalarAccessType const *>(&frag);
176  ScalarAccessType *scalarPointer = reinterpret_cast<ScalarAccessType *>(pointer_);
177 
179  for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
181  for (int s = 0; s < Policy::kElementsPerAccess; s++) {
182  scalarPointer[n * Policy::MmaSimtPolicy::WarpShape::kColumn * Policy::kElementsPerAccess + s] = scalarFragPtr[n * Policy::kElementsPerAccess + s];
183  }
184  }
185 #else
// Vectorized path: one AccessType-wide store per access.
186  // original vector stores
187  AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
189  for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
190  pointer_[n * Policy::MmaSimtPolicy::WarpShape::kColumn] = frag_ptr[n];
191  }
192 #endif
193  }
194 
// Store at the iterator's current location (offset 0).
197  void store(Fragment const &frag) {
198  store_with_pointer_offset(frag, 0);
199  }
200 
// Load one fragment. NOTE(review): as with store_with_pointer_offset, the
// pointer_offset parameter is ignored by the visible body — confirm intended.
203  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
204 
205  AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
206 
208  for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
209  frag_ptr[n] = pointer_[n * Policy::MmaSimtPolicy::WarpShape::kColumn];
210  }
211  }
212 
// Load from the iterator's current location (offset 0).
215  void load(Fragment &frag) const {
216  load_with_pointer_offset(frag, 0);
217  }
218 };
219 
221 
222 } // namespace warp
223 } // namespace epilogue
224 } // namespace cutlass
225 
Describes the size of a matrix tile.
Definition: matrix_shape.h:42
CUTLASS_HOST_DEVICE Index const & column() const
Returns the column of the coordinate.
Definition: matrix_coord.h:85
Definition: aligned_buffer.h:35
Array< typename Operator::ElementC, Policy::kAccumulatorElementCount > AccumulatorTile
This is the complete warp-level accumulator tile.
Definition: tile_iterator_simt.h:95
Aligned array type.
Definition: array.h:511
Array< typename Operator::ElementC, Policy::kElementsPerIteration > Fragment
This is the fragment size produced by one access of the iterator.
Definition: tile_iterator_simt.h:90
CUTLASS_HOST_DEVICE Index const & row() const
Returns the row of the coordinate.
Definition: matrix_coord.h:77
Definition: simt_policy.h:50
CUTLASS_HOST_DEVICE void load(Fragment &frag) const
Load.
Definition: tile_iterator_simt.h:215
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
CUTLASS_HOST_DEVICE void store(Fragment const &frag)
Store.
Definition: tile_iterator_simt.h:197
CUTLASS_HOST_DEVICE TileIteratorSimt & add_tile_offset(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: tile_iterator_simt.h:150
#define nullptr
nullptr
Definition: platform.h:144
CUTLASS_HOST_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset)
Store.
Definition: tile_iterator_simt.h:171
CUTLASS_HOST_DEVICE TileIteratorSimt & add_pointer_offset(Index pointer_offset)
Adds a pointer offset.
Definition: tile_iterator_simt.h:143
CUTLASS_HOST_DEVICE TileIteratorSimt(TensorRef const &ref, unsigned lane_id)
Constructor from TensorRef.
Definition: tile_iterator_simt.h:128
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
CUTLASS_HOST_DEVICE TileIteratorSimt()
Default constructor.
Definition: tile_iterator_simt.h:124
typename Layout::Index Index
Index type.
Definition: tensor_ref.h:165
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
Defines layout functions used by TensorRef and derived classes.
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
CUTLASS_HOST_DEVICE TileIteratorSimt & operator+=(TensorCoord const &tile_offset)
Definition: tile_iterator_simt.h:162
CUTLASS_HOST_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const
Load.
Definition: tile_iterator_simt.h:203
Defines basic structures needed for implementing the warp-scoped phase of the epilogue. These quantities assume a 'column-major' arrangement of SimtOp instructions, of which a row-oriented slice is visible per iteration.
Definition: matrix_coord.h:39
Template for reading and writing tiles of accumulators to shared memory.
Definition: tile_iterator_simt.h:55
typename Layout::LongIndex LongIndex
Long index used for pointer offsets.
Definition: tensor_ref.h:168