CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
tile_iterator_tensor_op.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include "cutlass/array.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/layout/pitch_linear.h"

#include "cutlass/epilogue/warp/tensor_op_policy.h"

////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace warp {

////////////////////////////////////////////////////////////////////////////////

/// Template for reading and writing tiles of accumulators to shared memory
template <
  typename WarpShape,     ///< shape of the warp-level GEMM tile
  typename OperatorShape, ///< shape of one TensorOp instruction
  typename Element,       ///< data type of element to be written
  typename Layout         ///< target shared memory layout
>
class TileIteratorTensorOp;

////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for row-major shared memory
template <
  typename WarpShape_,     ///< shape of the warp-level GEMM tile
  typename OperatorShape_, ///< shape of one TensorOp instruction
  typename Element_        ///< data type of element to be written
>
class TileIteratorTensorOp<WarpShape_, OperatorShape_, Element_, layout::RowMajor> {
public:

  using WarpShape = WarpShape_;
  using OperatorShape = OperatorShape_;
  using Element = Element_;
  using Layout = layout::RowMajor;

  using TensorRef = TensorRef<Element, Layout>;   ///< Tensor Reference object
  using TensorCoord = MatrixCoord;                ///< Logical coordinate in referenced tensor
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;

  /// Policy details related to the epilogue
  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;

  /// Shape of the tile in memory
  using Shape = MatrixShape<
    Policy::kRowsPerIteration,
    WarpShape::kN
  >;

  /// This is the fragment size produced by one access of the iterator.
  using Fragment = Array<
    Element,
    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;

  /// This is the complete warp-level accumulator tile.
  //using AccumulatorTile = typename Operator::FragmentC;

  /// Number of times this iterator can be incremented
  static int const kIterations = Policy::kIterations;

  // Internal constants
  struct Detail {
    static int const kLanesInQuad = 4;
  };

  /// Padding quantity applied to the shared memory tile
  using Padding = MatrixShape<
    0,
    Detail::kLanesInQuad * Policy::kElementsPerAccess>;

private:

  /// Storage type for accessing memory
  using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;

  //
  // Data members
  //

  /// Internal pointer to memory
  AccessType *pointer_;

  /// Internal layout object
  Layout layout_;

public:

  /// Default constructor
  CUTLASS_HOST_DEVICE
  TileIteratorTensorOp(): pointer_(nullptr) { }

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  TileIteratorTensorOp(
    TensorRef const &ref,
    unsigned lane_id
  ):
    pointer_(reinterpret_cast<AccessType *>(ref.data())),
    layout_(ref.stride()[0] / Policy::kElementsPerAccess) {

    // Lanes are grouped into quads of four: the quad selects the row, and the
    // lane within the quad selects the starting access within that row.
    int quad_id = (lane_id / Detail::kLanesInQuad);
    int lane_in_quad = (lane_id % Detail::kLanesInQuad);

    pointer_ += layout_({quad_id, lane_in_quad});
  }

  /// Adds a pointer offset
  CUTLASS_HOST_DEVICE
  TileIteratorTensorOp & add_pointer_offset(Index pointer_offset) {
    pointer_ += pointer_offset / Policy::kElementsPerAccess;
    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_HOST_DEVICE
  TileIteratorTensorOp & add_tile_offset(TensorCoord const &tile_offset) {

    pointer_ += layout_({
      tile_offset.row() * Shape::kRow,
      (tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess)
    });

    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_HOST_DEVICE
  TileIteratorTensorOp & operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  /// Store
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    // One access per TensorOp column; accesses of the four lanes in a quad are interleaved.
    CUTLASS_PRAGMA_UNROLL
    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
      pointer_[n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess] = frag_ptr[n];
    }
  }

  /// Store
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }

  /// Load
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
      frag_ptr[n] = pointer_[n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess];
    }
  }

  /// Load
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_pointer_offset(frag, 0);
  }
};

////////////////////////////////////////////////////////////////////////////////

} // namespace warp
} // namespace epilogue
} // namespace cutlass
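The iterator is intended to be driven one Fragment per epilogue iteration: construct it over a warp's shared memory tile with the calling lane's id, store a fragment, then advance by a whole tile before the next store. Below is a minimal, hypothetical usage sketch; the warp tile shape (64x64x8), the TensorOp instruction shape (8x8x4), the half_t element type, and the include paths are illustrative assumptions, not requirements of this header.

// Hypothetical usage sketch (not part of this header).
#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/half.h"
#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"

using WarpShape     = cutlass::gemm::GemmShape<64, 64, 8>;   // assumed warp-level tile
using OperatorShape = cutlass::gemm::GemmShape<8, 8, 4>;     // assumed TensorOp instruction shape

using TileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
    WarpShape, OperatorShape, cutlass::half_t, cutlass::layout::RowMajor>;

// Each lane of a warp writes one Fragment per iteration and then advances the
// iterator by a whole tile (Policy::kRowsPerIteration rows).
__device__ void store_accumulators_sketch(
    TileIterator::TensorRef smem_ref,                                   // shared memory tile
    TileIterator::Fragment const (&frag)[TileIterator::kIterations],    // per-iteration fragments
    unsigned lane_id) {

  TileIterator iterator(smem_ref, lane_id);

  CUTLASS_PRAGMA_UNROLL
  for (int iter = 0; iter < TileIterator::kIterations; ++iter) {
    iterator.store(frag[iter]);
    iterator.add_tile_offset({1, 0});   // step down by one tile of rows
  }
}

The {1, 0} tile offset mirrors how add_tile_offset() scales the row coordinate by Shape::kRow, so each step moves the pointer down by Policy::kRowsPerIteration rows of the warp tile.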