cp.async.bulk
PTX ISA: cp.async.bulk
Implementation notes
NOTE. Both srcMem
and dstMem
must be 16-byte aligned, and
size
must be a multiple of 16.
Changelog
In earlier versions,
cp_async_bulk_multicast
was enabled for SM_90. This has been changed to SM_90a.
Unicast
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .global }
template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
uint64_t* smem_bar);
cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90
// .dst = { .shared::cta }
// .src = { .global }
template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_shared_t,
cuda::ptx::space_global_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
uint64_t* smem_bar);
cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .shared::cta }
template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_cluster_t,
cuda::ptx::space_shared_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
uint64_t* rdsmem_bar);
cp.async.bulk.global.shared::cta.bulk_group
// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90
// .dst = { .global }
// .src = { .shared::cta }
template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
void* dstMem,
const void* srcMem,
const uint32_t& size);
cp.async.bulk.global.shared::cta.bulk_group.cp_mask
// cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100
// .dst = { .global }
// .src = { .shared::cta }
template <typename = void>
__device__ static inline void cp_async_bulk_cp_mask(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
const uint16_t& byteMask);
Multicast
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
uint64_t* smem_bar,
const uint16_t& ctaMask);