shared_memory mdspan and accessor#
shared_memory mdspan and accessor make it possible to express multi-dimensional views of the CUDA shared memory space and provide additional safety checks and performance optimizations.
Types and Traits#
namespace cuda {

template <typename AccessorPolicy>
using shared_memory_accessor = /* see below */;

template <typename ElementType,
          typename Extents,
          typename LayoutPolicy   = cuda::std::layout_right,
          typename AccessorPolicy = cuda::shared_memory_accessor<ElementType>>
class shared_memory_mdspan;

} // namespace cuda
mdspan type and accessor tailored for the shared memory space.
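For illustration, a concrete specialization of the types above can be spelled out explicitly; the alias name tile_t below is a placeholder introduced for this sketch, not part of the API.

#include <cuda/mdspan>

// A 32 x 32 view of int elements in shared memory, relying on the defaults from
// the synopsis above (layout_right and the shared memory accessor).
// The alias name tile_t is hypothetical and used only for this sketch.
using tile_t = cuda::shared_memory_mdspan<int, cuda::std::extents<int, 32, 32>>;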
namespace cuda {

template <typename T>
inline constexpr bool is_shared_memory_accessor_v = /* true if T is a shared_memory_accessor, false otherwise */;

template <typename T>
inline constexpr bool is_shared_memory_mdspan_v = /* true if T is a shared_memory_mdspan, false otherwise */;

} // namespace cuda
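As a sketch of how these traits can be used, the device helper below restricts itself to shared-memory views at compile time; the function name fill_diagonal is an assumption for this example and not part of the library.

#include <cuda/mdspan>

// Hypothetical helper (sketch): only accepts a cuda::shared_memory_mdspan.
template <typename Mdspan>
__device__ void fill_diagonal(Mdspan md, typename Mdspan::element_type value) {
  static_assert(cuda::is_shared_memory_mdspan_v<Mdspan>,
                "fill_diagonal expects a cuda::shared_memory_mdspan");
  static_assert(cuda::is_shared_memory_accessor_v<typename Mdspan::accessor_type>,
                "the view must use a shared_memory_accessor");
  for (typename Mdspan::index_type i = 0; i < md.extent(0); ++i) {
    md[i, i] = value; // C++23 multidimensional subscript
  }
}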
Features#
Constraints

Accessor

- data_handle_type must be a pointer type.

Preconditions

- Accessing elements through a shared_memory_accessor is only allowed in device code.
- The underlying pointer must be in the shared memory space (see the sketch after this list).
- The access offset must be within the maximum possible shared memory allocation size.

Performance considerations

- The functionality guarantees that accesses use shared memory instructions (STS/LDS) rather than generic memory instructions.
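A minimal sketch of the pointer precondition, assuming C++23; the global-memory buffer below is added only to show what is not allowed.

#include <cuda/mdspan>

__device__ int g_buffer[32 * 32]; // global memory, for illustration only

__global__ void precondition_example() {
  __shared__ int s_buffer[32 * 32]; // static shared memory

  // OK: s_buffer lives in the shared memory space.
  cuda::shared_memory_mdspan ok(s_buffer, cuda::std::dims<2>{32, 32});
  if (threadIdx.x == 0) {
    ok[0, 0] = 1;
  }

  // Precondition violation (undefined behavior): g_buffer lives in global
  // memory, so it must not be viewed through a shared_memory_mdspan.
  // cuda::shared_memory_mdspan bad(g_buffer, cuda::std::dims<2>{32, 32});
}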
Example#
#include <cuda/mdspan>
#include <cstdio>
__global__ void kernel() {
  extern __shared__ int shmem[];
  // Create a shared_memory_mdspan over the dynamic shared memory (32 x 32 ints)
  cuda::shared_memory_mdspan md(shmem, cuda::std::dims<2>{32, 32});
  if (threadIdx.x < 32) {
    md[threadIdx.x, threadIdx.x] = threadIdx.x; // write on the diagonal (C++23 subscript)
  }
  __syncthreads();
  if (threadIdx.x == 0) {
    printf("md[5][5] = %d\n", md[5, 5]); // read from the diagonal
  }
}

int main() {
  kernel<<<1, 32, 32 * 32 * sizeof(int)>>>(); // 32 * 32 ints of dynamic shared memory
  cudaDeviceSynchronize();
}
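With a 32-thread block, thread 5 writes the value 5 to md[5][5], so the kernel is expected to print md[5][5] = 5. The launch passes 32 * 32 * sizeof(int) bytes of dynamic shared memory, matching the 32 x 32 extents of the view, and the md[i, j] subscript used above requires compiling in C++23 mode.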