cuda::device::memcpy_async_tx#
Defined in header <cuda/barrier>:
template <typename T, size_t Alignment>
inline __device__
void cuda::device::memcpy_async_tx(
  T* dest,
  const T* src,
  cuda::aligned_size_t<Alignment> size,
  cuda::barrier<cuda::thread_scope_block>& bar);
Copies size bytes from global memory src to shared memory dest and decrements the transaction count of bar by size bytes.
Preconditions#
src and dest are 16-byte aligned and size is a multiple of 16, i.e., Alignment >= 16.
dest points to a shared memory allocation that is at least size bytes wide.
src points to a global memory allocation that is at least size bytes wide.
bar is located in shared memory.
If either dest or src is an invalid or null pointer, the behavior is undefined (even if size is zero).
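The alignment guarantee is carried by the type of the size argument. A minimal sketch of constructing it (the buffer name is illustrative):

#include <cuda/barrier>

__device__ alignas(16) int gmem_buf[1024]; // 16-byte aligned global buffer

__device__ void size_argument_sketch() {
  // cuda::aligned_size_t<16> asserts Alignment == 16; per the preconditions above,
  // the value it wraps must be a multiple of 16.
  cuda::aligned_size_t<16> copy_size(sizeof(gmem_buf)); // 4096 bytes, a multiple of 16
  (void)copy_size; // would be passed as the size argument of memcpy_async_tx
}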
Requires#
is_trivially_copyable_v<T> is true.
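For example, a user-defined element type can be checked against this requirement at compile time (the struct name is illustrative):

#include <cuda/std/type_traits>

struct int4_like { int x, y, z, w; }; // hypothetical, trivially copyable element type

static_assert(cuda::std::is_trivially_copyable_v<int4_like>,
              "cuda::device::memcpy_async_tx requires a trivially copyable element type");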
Notes#
This function can only be used on devices with CUDA Compute Capability 9.0 (Hopper) or higher.
There is no feature flag to check if cuda::device::memcpy_async_tx is available.
Comparison to cuda::memcpy_async: memcpy_async_tx supports a subset of the operations of memcpy_async.
It gives more control over the synchronization with a barrier than memcpy_async.
Currently, memcpy_async_tx has no synchronous fallback mechanism, i.e., it does not work on older hardware
(pre-CUDA Compute Capability 9.0, i.e., pre-Hopper).
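Since there is no feature flag, callers typically guard uses with an architecture check. A sketch of such a guard (names are illustrative), falling back to cuda::memcpy_async on older architectures; note that the fallback path is synchronized through thread arrival on the barrier rather than a transaction count:

#include <cuda/barrier>

__device__ void copy_to_shared(int* smem_dst, const int* gmem_src,
                               cuda::aligned_size_t<16> size,
                               cuda::barrier<cuda::thread_scope_block>& bar) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
  // Hopper and newer: the copy decrements bar's transaction count on completion,
  // so the caller must pair this with cuda::device::barrier_arrive_tx.
  cuda::device::memcpy_async_tx(smem_dst, gmem_src, size, bar);
#else
  // Older architectures: completion is tracked through arrival on bar instead.
  cuda::memcpy_async(smem_dst, gmem_src, size, bar);
#endif
}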
Example#
#include <cuda/barrier>
#include <cuda/ptx>          // cuda::ptx::elect_sync
#include <cuda/std/utility>  // cuda::std::move
#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_async_tx is not available.");
#endif // __CUDA_MINIMUM_ARCH__
__device__ alignas(16) int gmem_x[2048];
__device__ inline bool elect_one() {
  const unsigned int tid = threadIdx.x;
  const unsigned int warp_id = tid / 32;
  const unsigned int uniform_warp_id = __shfl_sync(0xFFFFFFFF, warp_id, 0); // broadcast from lane 0
  return (uniform_warp_id == 0 && cuda::ptx::elect_sync(0xFFFFFFFF));       // elect a leader thread among warp 0
}
__global__ void example_kernel() {
  alignas(16) __shared__ int smem_x[1024];
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
  // Set up the mbarrier so that it expects one arrival per thread in the block.
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  // Issue the async copy from a single thread and wait for completion.
  const bool is_block_leader = elect_one();
  const int tx_count = is_block_leader ? sizeof(smem_x) : 0; // only the leader expects the copied bytes
  if (is_block_leader) {
    cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(tx_count), bar);
  }
  // Every thread arrives on the barrier; the leader also adds the expected transaction count.
  auto token = cuda::device::barrier_arrive_tx(bar, 1, tx_count);
  bar.wait(cuda::std::move(token));

  // smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
  smem_x[threadIdx.x] += 1;
}
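For completeness, a hypothetical host-side launch of the kernel above (a Compute Capability 9.0 device is required at run time; one block of 1024 threads matches the size of smem_x):

int main() {
  example_kernel<<<1, 1024>>>();
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}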