template <typename Layout, int MatrixCount>
inline __device__
void ldsm(Array<unsigned, MatrixCount> & D,
          void const* ptr);
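// --- Illustrative sketch, not part of this header ---------------------------
// The primary template above is declared but never defined; only the six
// specializations below exist. A hedged sketch of what resolves and what does
// not (the cutlass::arch namespace and header paths are assumptions drawn
// from CUTLASS conventions):

#include "cutlass/array.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/arch/memory_sm75.h"  // this header (path assumed)

__device__ void ldsm_usage_sketch(void const *smem_ptr) {
  cutlass::Array<unsigned, 2> frag;
  // Dispatches to the RowMajor x2 specialization defined below.
  cutlass::arch::ldsm<cutlass::layout::RowMajor, 2>(frag, smem_ptr);
  // ldsm<layout::RowMajor, 3>(...) would compile but fail to link:
  // only MatrixCount in {1, 2, 4} is specialized.
}
// -----------------------------------------------------------------------------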
#if (__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ == 2)
#define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 1
#else
#define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 0
#endif

#if ! defined(CUDA_NVVM_GET_SHARED_POINTER_ENABLED)
#define CUDA_NVVM_GET_SHARED_POINTER_ENABLED (CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED)
#endif

#if ! defined(CUDA_LDMATRIX_SUPPORTED)
#define CUDA_LDMATRIX_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2))
#endif

#if ! defined(CUDA_LDMATRIX_ENABLED)
#define CUDA_LDMATRIX_ENABLED (CUDA_LDMATRIX_SUPPORTED)
#endif

#if (CUDA_LDMATRIX_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
#define CUDA_LDMATRIX_ACTIVATED 1
#else
#define CUDA_LDMATRIX_ACTIVATED 0
#endif

#if defined(CUTLASS_GET_SMEM_POINTER)
  // Client-provided implementation; nothing to do.
#elif CUDA_NVVM_GET_SHARED_POINTER_ENABLED
#if ! defined(NVVM_GET_SMEM_POINTER)
#define NVVM_GET_SMEM_POINTER
extern "C" {
// Internal NVVM intrinsic: converts a generic pointer into a 32-bit
// shared-memory address. Subject to change across CUDA versions; clients
// should use the ldsm<>() wrappers below rather than calling it directly.
__device__ uint32_t __nvvm_get_smem_pointer(void*);
}
#endif
#define CUTLASS_GET_SMEM_POINTER(ptr) __nvvm_get_smem_pointer((void*)ptr)
#endif
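// --- Illustrative sketch, not part of this header ---------------------------
// CUTLASS_GET_SMEM_POINTER can also be supplied by the client before this
// header is included. One hedged alternative, assuming a CUDA 11+ toolkit
// that ships the public __cvta_generic_to_shared() intrinsic (shown commented
// out so it does not shadow the definition above):
//
//   #define CUTLASS_GET_SMEM_POINTER(ptr) \
//     static_cast<unsigned>(__cvta_generic_to_shared((void *)(ptr)))
// -----------------------------------------------------------------------------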
template <>
inline __device__
void ldsm<layout::RowMajor, 1>(
    Array<unsigned, 1> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

    unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

    int x;
    asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];"
                  : "=r"(x) : "r"(addr));
    reinterpret_cast<int &>(D) = x;

  #else

    assert(0); // ldmatrix requires CUDA 10.2+ targeting SM 75 or newer

  #endif
}
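// --- Illustrative sketch, not part of this header ---------------------------
// How the x1 fragment maps back onto the 8x8 tile, per the PTX ISA's
// ldmatrix description: lanes 0-7 supply the eight row addresses, and after
// the load, lane (4*r + c) holds elements (r, 2c) and (r, 2c+1). The kernel
// name, header paths, and the cutlass::arch namespace are assumptions; launch
// with a single warp.

#include <cstdio>
#include <cuda_fp16.h>
#include "cutlass/array.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/arch/memory_sm75.h"  // this header (path assumed)

__global__ void ldsm_x1_layout_demo() {
  __shared__ __align__(16) __half tile[8][8];
  int lane = threadIdx.x % 32;

  // Fill the tile so that element (r, c) stores the value 8*r + c.
  for (int i = lane; i < 64; i += 32) {
    tile[i / 8][i % 8] = __float2half(float(i));
  }
  __syncthreads();

  cutlass::Array<unsigned, 1> frag;
  // Lanes 0-7 provide row addresses; the pointers of lanes 8-31 are ignored.
  cutlass::arch::ldsm<cutlass::layout::RowMajor, 1>(frag, &tile[lane % 8][0]);

  __half2 pair = reinterpret_cast<__half2 const &>(frag);
  printf("lane %2d -> %4.0f %4.0f\n", lane,
         __half2float(pair.x), __half2float(pair.y));
}
// -----------------------------------------------------------------------------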
template <>
inline __device__
void ldsm<layout::RowMajor, 2>(
    Array<unsigned, 2> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

    unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

    int x, y;
    asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];"
                  : "=r"(x), "=r"(y) : "r"(addr));
    reinterpret_cast<int2 &>(D) = make_int2(x, y);

  #else

    assert(0); // ldmatrix requires CUDA 10.2+ targeting SM 75 or newer

  #endif
}
template <>
inline __device__
void ldsm<layout::RowMajor, 4>(
    Array<unsigned, 4> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

    unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

    int x, y, z, w;
    asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];"
                  : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr));
    reinterpret_cast<int4 &>(D) = make_int4(x, y, z, w);

  #else

    assert(0); // ldmatrix requires CUDA 10.2+ targeting SM 75 or newer

  #endif
}
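// --- Illustrative sketch, not part of this header ---------------------------
// The x4 form loads four 8x8 matrices at once, with all 32 lanes supplying
// row addresses (lane i addresses row i % 8 of matrix i / 8). Which four
// sub-tiles those are is the caller's choice; the TL, TR, BL, BR ordering
// below is an assumption for illustration, as are the kernel name and headers.

#include <cuda_fp16.h>
#include "cutlass/array.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/arch/memory_sm75.h"  // this header (path assumed)

__global__ void ldsm_x4_demo() {
  __shared__ __align__(16) __half tile[16][16];
  int lane = threadIdx.x % 32;

  // Fill the 16x16 tile with its linear element index.
  for (int i = lane; i < 256; i += 32) {
    tile[i / 16][i % 16] = __float2half(float(i));
  }
  __syncthreads();

  int m = lane / 8;            // which 8x8 sub-tile this lane addresses
  int r = lane % 8;            // row within that sub-tile
  int row = (m / 2) * 8 + r;   // top half for m = 0,1; bottom half for m = 2,3
  int col = (m % 2) * 8;       // left half for even m; right half for odd m

  cutlass::Array<unsigned, 4> frag;
  cutlass::arch::ldsm<cutlass::layout::RowMajor, 4>(frag, &tile[row][col]);
}
// -----------------------------------------------------------------------------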
template <>
inline __device__
void ldsm<layout::ColumnMajor, 1>(
    Array<unsigned, 1> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

    unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

    int x;
    asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];"
                  : "=r"(x) : "r"(addr));
    reinterpret_cast<int &>(D) = x;

  #else

    assert(0); // ldmatrix requires CUDA 10.2+ targeting SM 75 or newer

  #endif
}
template <>
inline __device__
void ldsm<layout::ColumnMajor, 2>(
    Array<unsigned, 2> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

    unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

    int x, y;
    asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];"
                  : "=r"(x), "=r"(y) : "r"(addr));
    reinterpret_cast<int2 &>(D) = make_int2(x, y);

  #else

    assert(0); // ldmatrix requires CUDA 10.2+ targeting SM 75 or newer

  #endif
}
template <>
inline __device__
void ldsm<layout::ColumnMajor, 4>(
    Array<unsigned, 4> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

    unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

    int x, y, z, w;
    asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];"
                  : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr));
    reinterpret_cast<int4 &>(D) = make_int4(x, y, z, w);

  #else

    assert(0); // ldmatrix requires CUDA 10.2+ targeting SM 75 or newer

  #endif
}
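// --- Illustrative sketch, not part of this header ---------------------------
// The .trans forms transpose each 8x8 matrix as it is loaded, which is why
// they are bound to the layout::ColumnMajor tag: calling code can pick the
// right opcode from a layout type alone. load_fragment is a hypothetical
// helper, not a CUTLASS API.

#include "cutlass/array.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/arch/memory_sm75.h"  // this header (path assumed)

template <typename Layout>
__device__ void load_fragment(cutlass::Array<unsigned, 2> &frag, void const *ptr) {
  // Layout = layout::RowMajor    -> ldmatrix...x2
  // Layout = layout::ColumnMajor -> ldmatrix...x2.trans
  cutlass::arch::ldsm<Layout, 2>(frag, ptr);
}
// -----------------------------------------------------------------------------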