mbarrier.arrive
PTX ISA: mbarrier.arrive
mbarrier.arrive
Some of the listed PTX instructions below are semantically equivalent. They differ in one important way: the shorter instructions are typically supported on older compilers.
mbarrier.arrive.release.cta.shared::cta.b64
// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90
// .sem = { .release }
// .scope = { .cta, .cluster }
// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive(
cuda::ptx::sem_release_t,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::space_shared_t,
uint64_t* addr,
const uint32_t& count);
mbarrier.arrive.release.cluster.shared::cta.b64
// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90
// .sem = { .release }
// .scope = { .cta, .cluster }
// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive(
cuda::ptx::sem_release_t,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::space_shared_t,
uint64_t* addr,
const uint32_t& count);
mbarrier.arrive.release.cluster.shared::cluster.b64
// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90
// .sem = { .release }
// .scope = { .cluster }
// .space = { .shared::cluster }
template <typename = void>
__device__ static inline void mbarrier_arrive(
cuda::ptx::sem_release_t,
cuda::ptx::scope_cluster_t,
cuda::ptx::space_cluster_t,
uint64_t* addr,
const uint32_t& count);
mbarrier.arrive.no_complete
mbarrier.arrive.expect_tx
Usage
#include <cuda/ptx>
#include <cuda/barrier>
#include <cooperative_groups.h>
__global__ void kernel() {
using cuda::ptx::sem_release;
using cuda::ptx::space_cluster;
using cuda::ptx::space_shared;
using cuda::ptx::scope_cluster;
using cuda::ptx::scope_cta;
using barrier_t = cuda::barrier<cuda::thread_scope_block>;
__shared__ barrier_t bar;
init(&bar, blockDim.x);
__syncthreads();
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// Arrive on local shared memory barrier:
uint64_t token;
token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);
// Get address of remote cluster barrier:
namespace cg = cooperative_groups;
cg::cluster_group cluster = cg::this_cluster();
unsigned int other_block_rank = cluster.block_rank() ^ 1;
uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank);
// Sync cluster to ensure remote barrier is initialized.
cluster.sync();
// Arrive on remote cluster barrier:
cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1);
)
}