fragment_iterator_tensor_op.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief This defines a 'fragment' iterator for visiting the fragments of an
      accumulator tile that participate in one warp-level store operation.
*/

#pragma once

#include "cutlass/array.h"
#include "cutlass/layout/matrix.h"

#include "cutlass/epilogue/warp/tensor_op_policy.h"

////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace warp {

////////////////////////////////////////////////////////////////////////////////

/// Fragment iterator for TensorOp accumulators
template <
  typename WarpShape,          ///< shape of warp-level GEMM (concept: MatrixShape)
  typename OperatorShape,      ///< matrix multiply operation shape (concept: gemm::GemmShape)
  typename OperatorElementC,   ///< matrix multiply operation data type
  typename OperatorFragmentC,  ///< matrix multiply operation fragment (concept: Array)
  typename Layout              ///< target shared memory layout
>
class FragmentIteratorTensorOp;

////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for row-major shared memory
template <
  typename WarpShape_,
  typename OperatorShape_,
  typename OperatorElementC_,
  typename OperatorFragmentC_
>
class FragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
public:

  using WarpShape = WarpShape_;
  using OperatorShape = OperatorShape_;
  using OperatorElementC = OperatorElementC_;
  using OperatorFragmentC = OperatorFragmentC_;
  using Layout = layout::RowMajor;

  /// Policy details related to the epilogue
  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;

  /// This is the fragment size produced by one access of the iterator.
  using Fragment = Array<
    OperatorElementC,
    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;

  /// This is the complete warp-level accumulator tile.
  using AccumulatorTile = Array<
    OperatorElementC,
    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn>;

  /// Number of times this iterator can be incremented
  static int const kIterations = Policy::kIterations;

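  // Worked sizing sketch (shapes assumed for illustration, not taken from this
  // file): with a 64x64 warp tile and 16x8 TensorOp instructions, OperatorCount
  // is 4x8. Row-major TensorOpPolicy yields kElementsPerAccess = 2, so one
  // Fragment holds 8 * 2 = 16 elements, while the AccumulatorTile holds
  // 4 * 8 * 4 = 128 elements per thread (OperatorFragmentC::kElements = 4).
  // The kIterations = 8 loads of one Fragment each cover it exactly:
  // 8 * 16 = 128.
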
private:

  /// Internal access type
  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;

private:

  //
  // Data members
  //

  /// Accumulator tile
  AccessType const *accumulators_;

  /// Internal index
  int index_;

public:

  /// Constructs an iterator
  CUTLASS_HOST_DEVICE
  FragmentIteratorTensorOp(AccumulatorTile const &accum):
    accumulators_(reinterpret_cast<AccessType const *>(&accum)),
    index_(0) {
  }

  /// Increments
  CUTLASS_HOST_DEVICE
  FragmentIteratorTensorOp &operator++() {
    ++index_;
    return *this;
  }

  /// Decrements
  CUTLASS_HOST_DEVICE
  FragmentIteratorTensorOp &operator--() {
    --index_;
    return *this;
  }

  /// Loads a fragment from the referenced part of the accumulator tile
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, int index_offset = 0) const {

    int index = index_ + index_offset;

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {

      // One access per column of TensorOp operators; successive increments of
      // the iterator advance through rows via 'index'.
      int accumulator_access_offset =
        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;

      frag_ptr[n] = accumulators_[accumulator_access_offset];
    }
  }
};

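////////////////////////////////////////////////////////////////////////////////

// Illustrative usage sketch (not part of the library). A warp-scoped epilogue
// typically constructs this iterator over the warp's accumulator tile and then
// loads one Fragment per iteration. Here 'Iterator' names an assumed concrete
// instantiation of FragmentIteratorTensorOp and 'accumulators' an assumed
// AccumulatorTile instance:
//
//   Iterator frag_iterator(accumulators);
//
//   CUTLASS_PRAGMA_UNROLL
//   for (int iter = 0; iter < Iterator::kIterations; ++iter, ++frag_iterator) {
//
//     typename Iterator::Fragment frag;
//     frag_iterator.load(frag);   // one row-oriented slice of the tile
//
//     // ... apply the output operator to 'frag' and store it to shared memory ...
//   }
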
////////////////////////////////////////////////////////////////////////////////

/// Dedicated to interleaved layout
template <
    /// shape of the warp-level GEMM tile
    typename WarpShape_,
    /// matrix multiply operator shape (concept: gemm::GemmShape)
    typename OperatorShape_,
    /// matrix multiply operator data type
    typename OperatorElementC_,
    /// matrix multiply operator fragment (concept: Array)
    typename OperatorFragmentC_,
    /// number of interleaved k
    int InterleavedK>
class FragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_,
                               layout::ColumnMajorInterleaved<InterleavedK>> {
 public:
  using WarpShape = WarpShape_;
  using OperatorShape = OperatorShape_;
  using OperatorElementC = OperatorElementC_;
  using OperatorFragmentC = OperatorFragmentC_;
  static int const kInterleavedK = InterleavedK;
  using Layout = layout::ColumnMajorInterleaved<InterleavedK>;

  /// Policy details related to the epilogue
  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;

  /// This is the fragment size produced by one access of the iterator.
  using Fragment =
      Array<OperatorElementC,
            Policy::kElementsPerAccess * InterleavedK / OperatorShape::kN>;

  /// This is the complete warp-level accumulator tile.
  using AccumulatorTile =
      Array<OperatorElementC, OperatorFragmentC::kElements *
                                  Policy::OperatorCount::kRow *
                                  Policy::OperatorCount::kColumn>;

  /// Number of times this iterator can be incremented
  static int const kIterations = Policy::kIterations;

 private:
  /// Internal access type
  using AccessType =
      Array<OperatorElementC, Policy::kElementsPerAccess>;

 private:
  //
  // Data members
  //

  /// Accumulator tile
  AccessType const *accumulators_;

  /// Internal index
  int index_;

 public:
  /// Constructs an iterator
  CUTLASS_HOST_DEVICE
  FragmentIteratorTensorOp(AccumulatorTile const &accum)
      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
        index_(0) {}

  /// Increments
  CUTLASS_HOST_DEVICE
  FragmentIteratorTensorOp &operator++() {
    ++index_;
    return *this;
  }

  /// Decrements
  CUTLASS_HOST_DEVICE
  FragmentIteratorTensorOp &operator--() {
    --index_;
    return *this;
  }

  /// Loads a fragment from the referenced part of the accumulator tile
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, int index_offset = 0) const {
    int index = index_ + index_offset;

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int n = 0; n < (InterleavedK / OperatorShape::kN); ++n) {

      // Decompose the iteration index into a row position (including the
      // per-instruction sub-iterations) and an interleaved column group.
      int index_m = index % (Policy::OperatorCount::kRow *
                             Policy::kIterationsPerInstruction);
      int index_n = index / (Policy::OperatorCount::kRow *
                             Policy::kIterationsPerInstruction);

      // Combine the row operator, its sub-iteration, the column group, and the
      // operator column 'n' into a linear offset into the accumulator tile.
      int accumulator_access_offset =
          (index_m / Policy::kIterationsPerInstruction) *
              (Policy::OperatorCount::kColumn *
               Policy::kIterationsPerInstruction) +
          (index_m % Policy::kIterationsPerInstruction) +
          index_n * (InterleavedK / OperatorShape::kN) *
              Policy::kIterationsPerInstruction +
          n * Policy::kIterationsPerInstruction;

      frag_ptr[n] = accumulators_[accumulator_access_offset];
    }
  }
};

////////////////////////////////////////////////////////////////////////////////

} // namespace warp
} // namespace epilogue
} // namespace cutlass
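
////////////////////////////////////////////////////////////////////////////////

// Illustrative instantiation sketch (shapes are assumed, not taken from this
// file): a 64x64 warp tile built from 16x8x8 TensorOp instructions with float
// accumulators, where each instruction's per-thread fragment is Array<float, 4>:
//
//   #include "cutlass/gemm/gemm.h"
//
//   using WarpShape     = cutlass::gemm::GemmShape<64, 64, 8>;
//   using OperatorShape = cutlass::gemm::GemmShape<16, 8, 8>;
//
//   using FragmentIterator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
//       WarpShape, OperatorShape, float, cutlass::Array<float, 4>,
//       cutlass::layout::RowMajor>;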