CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
default_thread_map_volta_tensor_op.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
30 #pragma once
31 
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"

36 
37 namespace cutlass {
38 namespace epilogue {
39 namespace threadblock {
40 
42 
/// Defines the optimal thread map for Volta Tensor Op accumulator layouts.
/// Specialized on ElementAccumulator (half_t and float specializations below);
/// the primary template is declaration-only.
template <
  typename ThreadblockShape,     ///< threadblock-scoped GEMM tile shape
  typename WarpShape,            ///< warp-scoped GEMM tile shape
  int PartitionsK,               ///< number of K-dimension partitions (split-K slices)
  typename ElementOutput,        ///< data type of the output element
  int ElementsPerAccess,         ///< vector width of each memory access
  typename ElementAccumulator    ///< accumulator data type (selects the specialization)
>
struct DefaultThreadMapVoltaTensorOp;
55 
57 template <
58  typename ThreadblockShape_,
59  typename WarpShape_,
60  int PartitionsK,
61  typename ElementOutput_,
62  int ElementsPerAccess
63 >
65  ThreadblockShape_,
66  WarpShape_,
67  PartitionsK,
68  ElementOutput_,
69  ElementsPerAccess,
70  half_t> {
71 
72  using ThreadblockShape = ThreadblockShape_;
73  using WarpShape = WarpShape_;
74  static int const kPartitionsK = PartitionsK;
75  using ElementOutput = ElementOutput_;
76  static int const kElementsPerAccess = ElementsPerAccess;
78 
79  //
80  // Definitions
81  //
82 
83  struct Detail {
84 
85  static int const kTensorOpRows = 16;
86  static int const kWarpSize = 32;
87  static int const kInterleavedTilesM = WarpShape::kM / 32;
88 
90  !(ThreadblockShape::kM % WarpShape::kM) &&
91  !(ThreadblockShape::kM % WarpShape::kM), "Divisibility");
92 
94  using WarpCount = gemm::GemmShape<
95  ThreadblockShape::kM / WarpShape::kM,
96  ThreadblockShape::kN / WarpShape::kN,
97  kPartitionsK
98  >;
99 
101  static int const kThreads = WarpCount::kCount * kWarpSize;
102 
104  ThreadblockShape::kN, // column
105  4, // row
106  4, // group
107  WarpCount::kM, // cluster
108  1 // tile
109  >;
110 
113  1, // column
114  2, // row
115  kInterleavedTilesM, // group
116  1, // cluster
117  WarpShape::kM / kTensorOpRows // iterations
118  >;
119  };
120 
121  //
122  // ThreadMap
123  //
124 
127  typename Detail::Shape,
128  typename Detail::Count,
129  Detail::kThreads,
130  kElementsPerAccess,
132  >;
133 };
134 
136 
138 template <
139  typename ThreadblockShape_,
140  typename WarpShape_,
141  int PartitionsK,
142  typename ElementOutput_,
143  int ElementsPerAccess
144 >
146  ThreadblockShape_,
147  WarpShape_,
148  PartitionsK,
149  ElementOutput_,
150  ElementsPerAccess,
151  float> {
152 
153  using ThreadblockShape = ThreadblockShape_;
154  using WarpShape = WarpShape_;
155  static int const kPartitionsK = PartitionsK;
156  using ElementOutput = ElementOutput_;
157  static int const kElementsPerAccess = ElementsPerAccess;
158  using ElementAccumulator = float;
159 
160  //
161  // Definitions
162  //
163 
164  struct Detail {
165 
166  static int const kTensorOpRows = 16;
167  static int const kWarpSize = 32;
168  static int const kInterleavedTilesM = WarpShape::kM / 32;
169 
171  !(ThreadblockShape::kM % WarpShape::kM) &&
172  !(ThreadblockShape::kM % WarpShape::kM), "Divisibility");
173 
175  using WarpCount = gemm::GemmShape<
176  ThreadblockShape::kM / WarpShape::kM,
177  ThreadblockShape::kN / WarpShape::kN,
178  kPartitionsK
179  >;
180 
182  static int const kThreads = WarpCount::kCount * kWarpSize;
183 
185  ThreadblockShape::kN, // column
186  4, // row
187  4, // group
188  WarpCount::kM, // cluster
189  1 // tile
190  >;
191 
194  1, // column
195  2, // row
196  kInterleavedTilesM, // group
197  1, // cluster
198  WarpShape::kM / kTensorOpRows // iterations
199  >;
200  };
201 
202  //
203  // ThreadMap
204  //
205 
208  typename Detail::Shape,
209  typename Detail::Count,
210  Detail::kThreads,
211  kElementsPerAccess,
213  >;
214 };
215 
217 
218 } // namespace threadblock
219 } // namespace epilogue
220 } // namespace cutlass
221 
Definition: output_tile_thread_map.h:228
Definition: aligned_buffer.h:35
Tuple defining point in output tile.
Definition: output_tile_thread_map.h:57
Epilogue for threadblock scoped GEMMs using Tensor Ops.
IEEE half-precision floating-point type.
Definition: half.h:126
Defines common types used for all GEMM-like operators.
Defines the size of an element in bits.
Definition: numeric_types.h:42
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57
#define static_assert(__e, __m)
Definition: platform.h:153
Defines the optimal thread map for TensorOp accumulator layouts.
Definition: default_thread_map_volta_tensor_op.h:52