CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
memory_sm75.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Architecture-specific operators on memory added for SM75
*/

#pragma once

#include <cassert>

#include "cutlass/array.h"
#include "cutlass/layout/matrix.h"

namespace cutlass {
namespace arch {

/////////////////////////////////////////////////////////////////////////////////////////////////

template <
  /// Layout of destination matrix (column-major implies transpose)
  typename Layout,
  /// Number of 8x8 matrices loaded: .x1, .x2, or .x4
  int MatrixCount
>
inline __device__ void ldsm(Array<unsigned, MatrixCount> & D, void const* ptr);
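
//
// Usage sketch (added commentary, not part of the original header): a warp
// cooperatively loads 8x8 matrices of 16-bit elements from shared memory. For
// the .x1 form, the row addresses supplied by threads 0..7 select the eight
// rows, and every thread in the warp receives one packed 32-bit register in D.
// The tile shape and thread-to-row mapping below are assumptions made for the
// example:
//
//   __global__ void ldsm_x1_example() {
//     __shared__ uint16_t tile[8][8];
//     cutlass::Array<unsigned, 1> frag;
//     void const *row_ptr = &tile[threadIdx.x % 8][0];
//     cutlass::arch::ldsm<cutlass::layout::RowMajor, 1>(frag, row_ptr);
//   }
//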

/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Specializations
//
/////////////////////////////////////////////////////////////////////////////////////////////////

#if (__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ == 2)
  #define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 1
#else
  #define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 0
#endif

#if ! defined(CUDA_NVVM_GET_SHARED_POINTER_ENABLED)
  #define CUDA_NVVM_GET_SHARED_POINTER_ENABLED (CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED)
#endif

#if ! defined(CUDA_LDMATRIX_SUPPORTED)
  #define CUDA_LDMATRIX_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2))
#endif

#if ! defined(CUDA_LDMATRIX_ENABLED)
  #define CUDA_LDMATRIX_ENABLED (CUDA_LDMATRIX_SUPPORTED)
#endif

#if (CUDA_LDMATRIX_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
  #define CUDA_LDMATRIX_ACTIVATED 1
#else
  #define CUDA_LDMATRIX_ACTIVATED 0
#endif
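
//
// Note (added commentary): the *_SUPPORTED macros record whether this CUDA
// toolkit can emit the instruction at all, the *_ENABLED macros let a build
// opt out, and CUDA_LDMATRIX_ACTIVATED is true only while compiling device
// code for SM75 or newer. The specializations below test only the ACTIVATED
// macro.
//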

#if defined(CUTLASS_GET_SMEM_POINTER)
  // Use the existing implementation
#elif CUDA_NVVM_GET_SHARED_POINTER_ENABLED
  #if ! defined(NVVM_GET_SMEM_POINTER)
    #define NVVM_GET_SMEM_POINTER
    extern "C" {
    //
    // This NVVM intrinsic is subject to change in future versions of CUDA.
    // Clients should not call it directly. Rather, they should use the
    // cutlass::arch::ldsm<>() template.
    //
    __device__ uint32_t __nvvm_get_smem_pointer(void*);
    }
  #endif
  #define CUTLASS_GET_SMEM_POINTER(ptr) __nvvm_get_smem_pointer((void*)ptr)
#endif
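
//
// Background (added commentary): ldmatrix expects a 32-bit address within the
// shared memory state space, so the generic C++ pointer must be converted
// first. Where __nvvm_get_smem_pointer is unavailable, an equivalent
// conversion can be written with inline PTX; the helper below is a sketch
// under that assumption, not part of the original header:
//
//   __device__ inline unsigned cast_smem_ptr_to_uint(void const *ptr) {
//     unsigned smem_addr;
//     asm volatile (
//       "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; "
//       "cvt.u32.u64 %0, smem_ptr; }"
//       : "=r"(smem_addr) : "l"(ptr));
//     return smem_addr;
//   }
//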

/////////////////////////////////////////////////////////////////////////////////////////////////

template <>
inline __device__ void ldsm<layout::RowMajor, 1>(
    Array<unsigned, 1> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

  unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int x;
  asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr));
  reinterpret_cast<int &>(D) = x;

  #else

  assert(0);

  #endif
}
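
// (Added commentary) Each 32-bit register returned by ldmatrix packs two
// 16-bit elements; the .x1 form fills one register per thread, .x2 two, and
// .x4 four, matching the MatrixCount template argument.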

/////////////////////////////////////////////////////////////////////////////////////////////////

template <>
inline __device__ void ldsm<layout::RowMajor, 2>(
    Array<unsigned, 2> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

  unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int x, y;
  asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(x), "=r"(y) : "r"(addr));
  reinterpret_cast<int2 &>(D) = make_int2(x, y);

  #else

  assert(0);

  #endif
}

/////////////////////////////////////////////////////////////////////////////////////////////////

template <>
inline __device__ void ldsm<layout::RowMajor, 4>(
    Array<unsigned, 4> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

  unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int x, y, z, w;
  asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr));
  reinterpret_cast<int4 &>(D) = make_int4(x, y, z, w);

  #else

  assert(0);

  #endif
}

/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Transpose on 16b granularity
//
/////////////////////////////////////////////////////////////////////////////////////////////////
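
// (Added commentary) The .trans qualifier below makes ldmatrix transpose each
// 8x8 matrix at 16-bit granularity as it is loaded; this is the only way the
// layout::ColumnMajor specializations differ from the row-major ones above.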

template <>
inline __device__ void ldsm<layout::ColumnMajor, 1>(
    Array<unsigned, 1> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

  unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int x;
  asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr));
  reinterpret_cast<int &>(D) = x;

  #else

  assert(0);

  #endif
}

/////////////////////////////////////////////////////////////////////////////////////////////////

template <>
inline __device__ void ldsm<layout::ColumnMajor, 2>(
    Array<unsigned, 2> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

  unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int x, y;
  asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(x), "=r"(y) : "r"(addr));
  reinterpret_cast<int2 &>(D) = make_int2(x, y);

  #else

  assert(0);

  #endif
}

/////////////////////////////////////////////////////////////////////////////////////////////////

template <>
inline __device__ void ldsm<layout::ColumnMajor, 4>(
    Array<unsigned, 4> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

  unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int x, y, z, w;
  asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr));
  reinterpret_cast<int4 &>(D) = make_int4(x, y, z, w);

  #else

  assert(0);

  #endif
}
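
//
// Usage sketch (added commentary, not part of the original header): the .x4
// form loads four 8x8 matrices in one instruction, with all 32 threads of the
// warp supplying row addresses (threads 0..7 address the first matrix, 8..15
// the second, and so on). Treating a 16x16 tile of 16-bit data as four 8x8
// submatrices is one natural fit; the mapping below is an assumption made for
// the example:
//
//   __global__ void ldsm_x4_example() {
//     __shared__ uint16_t tile[16][16];
//     cutlass::Array<unsigned, 4> frag;
//     int row  = threadIdx.x % 16;        // row within the 16x16 tile
//     int half = (threadIdx.x / 16) * 8;  // left or right 8-column half
//     cutlass::arch::ldsm<cutlass::layout::RowMajor, 4>(frag, &tile[row][half]);
//   }
//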

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace arch
} // namespace cutlass