multimem.red
PTX ISA: multimem.red (documented in the "multimem.ld_reduce, multimem.st, multimem.red" section of the PTX ISA manual)
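multimem.red performs a reduction operation on all of the memory locations that a multimem (multicast) address maps to. The cuda::ptx::multimem_red overloads listed below expose each supported combination of memory ordering (.sem), scope (.scope), operation (.op), and operand type; all of them require PTX ISA 8.1 and SM_90 or newer. The sketch below shows a minimal call, assuming a multicast pointer mc_addr set up beforehand (for example through the CUDA driver multicast APIs) and compilation with -arch=sm_90 or later; the kernel name and parameters are illustrative only.

#include <cuda/ptx>
#include <cstdint>

__global__ void min_all_replicas(uint32_t* mc_addr, uint32_t v)
{
  // Emits: multimem.red.relaxed.gpu.global.min.u32 [mc_addr], v;
  cuda::ptx::multimem_red(
    cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_min, mc_addr, v);
}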
multimem.red.relaxed.cta.global.min.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.cluster.global.min.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.gpu.global.min.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.sys.global.min.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.cta.global.min.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.cluster.global.min.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.gpu.global.min.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.sys.global.min.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.cta.global.min.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.cluster.global.min.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.gpu.global.min.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.sys.global.min.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.cta.global.min.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.cluster.global.min.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.gpu.global.min.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.sys.global.min.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.cta.global.min.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.cluster.global.min.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.gpu.global.min.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.sys.global.min.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int32_t* addr,
int32_t val);
multimem.red.release.cta.global.min.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int32_t* addr,
int32_t val);
multimem.red.release.cluster.global.min.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int32_t* addr,
int32_t val);
multimem.red.release.gpu.global.min.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int32_t* addr,
int32_t val);
multimem.red.release.sys.global.min.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.cta.global.min.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int64_t* addr,
int64_t val);
multimem.red.relaxed.cluster.global.min.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int64_t* addr,
int64_t val);
multimem.red.relaxed.gpu.global.min.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int64_t* addr,
int64_t val);
multimem.red.relaxed.sys.global.min.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int64_t* addr,
int64_t val);
multimem.red.release.cta.global.min.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int64_t* addr,
int64_t val);
multimem.red.release.cluster.global.min.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int64_t* addr,
int64_t val);
multimem.red.release.gpu.global.min.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int64_t* addr,
int64_t val);
multimem.red.release.sys.global.min.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .min }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_min_t,
int64_t* addr,
int64_t val);
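The signed min overloads differ only in operand type. For instance, a release-ordered, system-scope 64-bit signed minimum (same mc_addr assumption as above):

__global__ void min_release_sys(int64_t* mc_addr, int64_t v)
{
  // Emits: multimem.red.release.sys.global.min.s64 [mc_addr], v;
  cuda::ptx::multimem_red(
    cuda::ptx::sem_release, cuda::ptx::scope_sys, cuda::ptx::op_min, mc_addr, v);
}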
multimem.red.relaxed.cta.global.max.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.cluster.global.max.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.gpu.global.max.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.sys.global.max.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.cta.global.max.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.cluster.global.max.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.gpu.global.max.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.sys.global.max.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.cta.global.max.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.cluster.global.max.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.gpu.global.max.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.sys.global.max.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.cta.global.max.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.cluster.global.max.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.gpu.global.max.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.sys.global.max.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.cta.global.max.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.cluster.global.max.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.gpu.global.max.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.sys.global.max.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int32_t* addr,
int32_t val);
multimem.red.release.cta.global.max.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int32_t* addr,
int32_t val);
multimem.red.release.cluster.global.max.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int32_t* addr,
int32_t val);
multimem.red.release.gpu.global.max.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int32_t* addr,
int32_t val);
multimem.red.release.sys.global.max.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.cta.global.max.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int64_t* addr,
int64_t val);
multimem.red.relaxed.cluster.global.max.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int64_t* addr,
int64_t val);
multimem.red.relaxed.gpu.global.max.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int64_t* addr,
int64_t val);
multimem.red.relaxed.sys.global.max.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int64_t* addr,
int64_t val);
multimem.red.release.cta.global.max.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int64_t* addr,
int64_t val);
multimem.red.release.cluster.global.max.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int64_t* addr,
int64_t val);
multimem.red.release.gpu.global.max.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int64_t* addr,
int64_t val);
multimem.red.release.sys.global.max.s64
// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .max }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_max_t,
int64_t* addr,
int64_t val);
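The max overloads mirror the min set. Because the semantics and scope are carried by tag parameters, calling code can stay generic over them; a sketch of a thin forwarding wrapper (the wrapper name is illustrative):

template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ void max_all_replicas(
  cuda::ptx::sem_t<Sem> sem, cuda::ptx::scope_t<Scope> scope, uint64_t* mc_addr, uint64_t v)
{
  // Resolves to multimem.red.sem.scope.global.max.u64 for the given tags.
  cuda::ptx::multimem_red(sem, scope, cuda::ptx::op_max, mc_addr, v);
}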
multimem.red.relaxed.cta.global.add.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.cluster.global.add.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.gpu.global.add.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.sys.global.add.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.cta.global.add.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.cluster.global.add.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.gpu.global.add.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint32_t* addr,
uint32_t val);
multimem.red.release.sys.global.add.u32
// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint32_t* addr,
uint32_t val);
multimem.red.relaxed.cta.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.cluster.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.gpu.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.sys.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.cta.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.cluster.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.gpu.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint64_t* addr,
uint64_t val);
multimem.red.release.sys.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
uint64_t* addr,
uint64_t val);
multimem.red.relaxed.cta.global.add.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.cluster.global.add.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.gpu.global.add.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.sys.global.add.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int32_t* addr,
int32_t val);
multimem.red.release.cta.global.add.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int32_t* addr,
int32_t val);
multimem.red.release.cluster.global.add.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int32_t* addr,
int32_t val);
multimem.red.release.gpu.global.add.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int32_t* addr,
int32_t val);
multimem.red.release.sys.global.add.s32
// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int32_t* addr,
int32_t val);
multimem.red.relaxed.cta.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int64_t* addr,
int64_t val);
multimem.red.relaxed.cluster.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int64_t* addr,
int64_t val);
multimem.red.relaxed.gpu.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int64_t* addr,
int64_t val);
multimem.red.relaxed.sys.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int64_t* addr,
int64_t val);
multimem.red.release.cta.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int64_t* addr,
int64_t val);
multimem.red.release.cluster.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int64_t* addr,
int64_t val);
multimem.red.release.gpu.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int64_t* addr,
int64_t val);
multimem.red.release.sys.global.add.u64
// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .add }
template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_add_t,
int64_t* addr,
int64_t val);
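Note that the int64_t add overloads above appear under .u64 headings on purpose: PTX does not provide an .add.s64 variant of multimem.red, so 64-bit signed addition is lowered to the .add.u64 instruction, which produces identical results under two's-complement wraparound. For example:

__global__ void add_signed64(int64_t* mc_addr, int64_t v)
{
  // Emits: multimem.red.relaxed.gpu.global.add.u64 [mc_addr], v;
  cuda::ptx::multimem_red(
    cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_add, mc_addr, v);
}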
multimem.red.relaxed.cta.global.and.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.cluster.global.and.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.gpu.global.and.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.sys.global.and.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B32* addr,
B32 val);
multimem.red.release.cta.global.and.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B32* addr,
B32 val);
multimem.red.release.cluster.global.and.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B32* addr,
B32 val);
multimem.red.release.gpu.global.and.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B32* addr,
B32 val);
multimem.red.release.sys.global.and.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.cta.global.or.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.cluster.global.or.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.gpu.global.or.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.sys.global.or.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B32* addr,
B32 val);
multimem.red.release.cta.global.or.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B32* addr,
B32 val);
multimem.red.release.cluster.global.or.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B32* addr,
B32 val);
multimem.red.release.gpu.global.or.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B32* addr,
B32 val);
multimem.red.release.sys.global.or.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.cta.global.xor.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.cluster.global.xor.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.gpu.global.xor.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B32* addr,
B32 val);
multimem.red.relaxed.sys.global.xor.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B32* addr,
B32 val);
multimem.red.release.cta.global.xor.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B32* addr,
B32 val);
multimem.red.release.cluster.global.xor.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B32* addr,
B32 val);
multimem.red.release.gpu.global.xor.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B32* addr,
B32 val);
multimem.red.release.sys.global.xor.b32
// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B32* addr,
B32 val);
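The bitwise overloads accept any 4-byte type B32 (enforced by the sizeof(B32) == 4 constraint) and treat the value as untyped .b32 data. A sketch that ORs a flag mask into every replica, under the same mc_addr assumption:

__global__ void or_flags(uint32_t* mc_addr, uint32_t mask)
{
  // Emits: multimem.red.relaxed.gpu.global.or.b32 [mc_addr], mask;
  cuda::ptx::multimem_red(
    cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_or_op, mc_addr, mask);
}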
multimem.red.relaxed.cta.global.and.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.cluster.global.and.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.gpu.global.and.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.sys.global.and.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B64* addr,
B64 val);
multimem.red.release.cta.global.and.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B64* addr,
B64 val);
multimem.red.release.cluster.global.and.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B64* addr,
B64 val);
multimem.red.release.gpu.global.and.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B64* addr,
B64 val);
multimem.red.release.sys.global.and.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .and }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_and_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.cta.global.or.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.cluster.global.or.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.gpu.global.or.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.sys.global.or.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B64* addr,
B64 val);
multimem.red.release.cta.global.or.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B64* addr,
B64 val);
multimem.red.release.cluster.global.or.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B64* addr,
B64 val);
multimem.red.release.gpu.global.or.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B64* addr,
B64 val);
multimem.red.release.sys.global.or.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_or_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.cta.global.xor.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.cluster.global.xor.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.gpu.global.xor.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B64* addr,
B64 val);
multimem.red.relaxed.sys.global.xor.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B64* addr,
B64 val);
multimem.red.release.cta.global.xor.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B64* addr,
B64 val);
multimem.red.release.cluster.global.xor.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B64* addr,
B64 val);
multimem.red.release.gpu.global.xor.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B64* addr,
B64 val);
multimem.red.release.sys.global.xor.b64
// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90
// .sem = { .relaxed, .release }
// .scope = { .cta, .cluster, .gpu, .sys }
// .op = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline void multimem_red(
cuda::ptx::sem_t<Sem> sem,
cuda::ptx::scope_t<Scope> scope,
cuda::ptx::op_xor_op_t,
B64* addr,
B64 val);
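The b64 bitwise forms follow the same pattern with 8-byte types. Since every overload on this page requires SM_90, code that must also compile for older architectures should guard the call on the target architecture; a sketch:

__global__ void xor_all_replicas(uint64_t* mc_addr, uint64_t v)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
  // Emits: multimem.red.relaxed.gpu.global.xor.b64 [mc_addr], v;
  cuda::ptx::multimem_red(
    cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_xor_op, mc_addr, v);
#endif
}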