CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
default_thread_map_tensor_op.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/numeric_types.h"
#include "cutlass/layout/pitch_linear.h"

#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines the optimal thread map for TensorOp accumulator layouts.
template <
  typename ThreadblockShape_,
  typename WarpShape_,
  int PartitionsK,
  typename Element_,
  int ElementsPerAccess
>
struct DefaultThreadMapTensorOp {

  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  static int const kPartitionsK = PartitionsK;
  using Element = Element_;
  static int const kElementsPerAccess = ElementsPerAccess;

  //
  // Definitions
  //

  struct Detail {

    /// Tensor Operations fundamentally perform operations on 8 rows
    static int const kTensorOpRows = 8;
    static int const kWarpSize = 32;

    static_assert(
      !(ThreadblockShape::kM % WarpShape::kM) &&
      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");

    /// Number of warps
    using WarpCount = gemm::GemmShape<
      ThreadblockShape::kM / WarpShape::kM,
      ThreadblockShape::kN / WarpShape::kN,
      kPartitionsK
    >;

    /// Number of participating threads
    static int const kThreads = WarpCount::kCount * kWarpSize;
  };

  //
  // ThreadMap
  //

  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
  using Type = OutputTileOptimalThreadMap<
    OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
    OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
    Detail::kThreads,
    kElementsPerAccess,
    sizeof_bits<Element>::value
  >;
};
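
// Illustrative usage (editor's sketch, not part of the original header). Every
// parameter below is an assumption chosen for illustration: a 128x128x32
// threadblock tile, 64x64x32 warp tiles, half-precision output, and 128-bit
// (8-element) stores. For these values Detail::WarpCount evaluates to
// gemm::GemmShape<2, 2, 1>, so Detail::kThreads = 4 * 32 = 128 threads
// cooperate in the epilogue.
using ExampleThreadMapTensorOp = DefaultThreadMapTensorOp<
    gemm::GemmShape<128, 128, 32>,  // ThreadblockShape (assumed)
    gemm::GemmShape<64, 64, 32>,    // WarpShape (assumed)
    1,                              // PartitionsK
    half_t,                         // Element (assumed)
    8                               // ElementsPerAccess = 128 bits / 16-bit half
>::Type;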

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines the optimal thread map for TensorOp accumulator layouts.
template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK,
          typename Element_, int ElementsPerAccess, int InterleavedK>
struct DefaultInterleavedThreadMapTensorOp {
  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  static int const kPartitionsK = PartitionsK;
  using Element = Element_;
  static int const kElementsPerAccess = ElementsPerAccess;
  static int const kInterleavedK = InterleavedK;

  //
  // Definitions
  //

  struct Detail {
    /// Tensor Operations fundamentally perform operations on 8 rows
    static int const kTensorOpRows = 8;
    static int const kWarpSize = 32;

    static_assert(!(ThreadblockShape::kM % WarpShape::kM) &&
                      !(ThreadblockShape::kN % WarpShape::kN),
                  "Divisibility");

    /// Number of warps
    using WarpCount =
        gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,
                        ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;

    /// Number of participating threads
    static int const kThreads = WarpCount::kCount * kWarpSize;
  };

  //
  // ThreadMap
  //

  /// ThreadMap to be used by epilogue::MaskedTileIterator satisfying concept
  /// InterleavedOutputTileThreadMap
  using Type = InterleavedOutputTileThreadMap<
      layout::PitchLinearShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,
      layout::PitchLinearShape<WarpShape::kM / Detail::kTensorOpRows,
                               WarpShape::kN / InterleavedK>,
      Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;
};
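
// Illustrative usage (editor's sketch, not part of the original header). The
// configuration is an assumption modeled on int8 GEMMs whose output uses an
// interleaved layout such as ColumnMajorInterleaved<32>. Here Detail::WarpCount
// evaluates to gemm::GemmShape<2, 2, 1>, Detail::kThreads = 128, and the
// iteration shape's strided component is WarpShape::kN / InterleavedK = 64 / 32 = 2.
using ExampleInterleavedThreadMapTensorOp = DefaultInterleavedThreadMapTensorOp<
    gemm::GemmShape<128, 128, 64>,  // ThreadblockShape (assumed)
    gemm::GemmShape<64, 64, 64>,    // WarpShape (assumed)
    1,                              // PartitionsK
    int8_t,                         // Element (assumed)
    8,                              // ElementsPerAccess (64-bit accesses)
    32                              // InterleavedK (assumed)
>::Type;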

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////