cp.reduce.async.bulk

Integer and floating-point instructions

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .b32 }
// .op        = { .and }
template <typename B32>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_and_op_t,
  B32* dstMem,
  const B32* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .b32 }
// .op        = { .or }
template <typename B32>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_or_op_t,
  B32* dstMem,
  const B32* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .b32 }
// .op        = { .xor }
template <typename B32>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_xor_op_t,
  B32* dstMem,
  const B32* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .min }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_min_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .max }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_max_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .inc }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_inc_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .dec }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_dec_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .s32 }
// .op        = { .min }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_min_t,
  int32_t* dstMem,
  const int32_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .s32 }
// .op        = { .max }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_max_t,
  int32_t* dstMem,
  const int32_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .s32 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  int32_t* dstMem,
  const int32_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .u64 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  uint64_t* dstMem,
  const uint64_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 (emulates .add.s64; see "Emulation of the .s64 instruction" below)

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .s64 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  int64_t* dstMem,
  const int64_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);
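
All shared::cluster variants above share one calling pattern: the source is the local CTA's shared memory, while the destination buffer and the mbarrier (rdsmem_bar) live in a peer CTA's shared memory, and completion is observed through that mbarrier. The sketch below illustrates this for op_add on uint32_t. The cluster configuration, buffer sizes, output pointer, and the surrounding mbarrier handling are illustrative assumptions; only cp_reduce_async_bulk itself comes from the signatures above.

#include <cuda/ptx>
#include <cooperative_groups.h>
#include <cstdint>

namespace cg = cooperative_groups;

// Each CTA reduces its 128-byte shared buffer into the next CTA's buffer with .add.u32.
// `out` must hold at least 32 uint32_t (illustrative).
__global__ void __cluster_dims__(2, 1, 1) cluster_reduce_add(uint32_t* out)
{
  alignas(16) __shared__ uint32_t dst[32]; // 16-byte aligned, size a multiple of 16 bytes
  alignas(16) __shared__ uint32_t src[32];
  __shared__ uint64_t bar;                 // mbarrier in this CTA's shared memory

  auto cluster = cg::this_cluster();
  constexpr uint32_t size_bytes = sizeof(src);

  if (threadIdx.x == 0)
  {
    cuda::ptx::mbarrier_init(&bar, 1);
    // Make the initialized mbarrier visible to the rest of the cluster.
    cuda::ptx::fence_mbarrier_init(cuda::ptx::sem_release, cuda::ptx::scope_cluster);
    // Account for the bytes that the peer CTA will reduce into this CTA's buffer.
    cuda::ptx::mbarrier_arrive_expect_tx(
      cuda::ptx::sem_release, cuda::ptx::scope_cluster, cuda::ptx::space_shared, &bar, size_bytes);
  }
  for (int i = threadIdx.x; i < 32; i += blockDim.x)
  {
    dst[i] = 0;
    src[i] = i;
  }
  cluster.sync(); // buffers and barriers are set up cluster-wide from here on

  unsigned peer        = (cluster.block_rank() + 1) % cluster.num_blocks();
  uint32_t* remote_dst = cluster.map_shared_rank(dst, peer);
  uint64_t* remote_bar = cluster.map_shared_rank(&bar, peer);

  if (threadIdx.x == 0)
  {
    // Asynchronous .add.u32 reduction into the peer CTA's shared memory; completion is
    // signalled on the peer's mbarrier (any required async-proxy fencing is elided here).
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_cluster, cuda::ptx::space_shared, cuda::ptx::op_add,
      remote_dst, src, size_bytes, remote_bar);
  }
  // Wait until the incoming reduction into the local buffer has completed (phase 0).
  while (!cuda::ptx::mbarrier_try_wait_parity(&bar, 0)) {}
  cluster.sync();

  if (cluster.block_rank() == 0)
  {
    for (int i = threadIdx.x; i < 32; i += blockDim.x)
    {
      out[i] = dst[i];
    }
  }
}

The bitwise and min/max/inc/dec overloads are invoked the same way; only the op tag (op_and_op, op_min, ...) and the element type of dstMem/srcMem change.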

cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .b32, .b64 }
// .op        = { .and }
template <typename Type>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_and_op_t,
  Type* dstMem,
  const Type* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .b32, .b64 }
// .op        = { .and }
template <typename Type>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_and_op_t,
  Type* dstMem,
  const Type* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .b32, .b64 }
// .op        = { .or }
template <typename Type>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_or_op_t,
  Type* dstMem,
  const Type* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .b32, .b64 }
// .op        = { .or }
template <typename Type>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_or_op_t,
  Type* dstMem,
  const Type* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .b32, .b64 }
// .op        = { .xor }
template <typename Type>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_xor_op_t,
  Type* dstMem,
  const Type* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .b32, .b64 }
// .op        = { .xor }
template <typename Type>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_xor_op_t,
  Type* dstMem,
  const Type* srcMem,
  uint32_t size);
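
The .global destination variants complete through a bulk async-group rather than an mbarrier: the issuing thread commits the operation and then waits on the group. Below is a minimal sketch for the .or.b32 overload; the gmem pointer and the 128-byte buffer size are illustrative assumptions.

#include <cuda/ptx>
#include <cstdint>

// Bitwise-OR 128 bytes of shared memory into global memory; gmem must hold at least 32 uint32_t.
__global__ void or_into_global(uint32_t* gmem)
{
  alignas(16) __shared__ uint32_t smem[32];
  for (int i = threadIdx.x; i < 32; i += blockDim.x)
  {
    smem[i] = 1u << (i % 32);
  }
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // Bitwise OR reduction of the shared buffer into global memory.
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_or_op,
      gmem, smem, sizeof(smem));
    // bulk_group completion: commit the operation, then wait until the shared buffer may be
    // reused (any required async-proxy fencing is elided in this sketch).
    cuda::ptx::cp_async_bulk_commit_group();
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
  __syncthreads();
}

op_and_op and op_xor_op are used identically; passing 64-bit element types selects the .b64 forms.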

cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .min }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_min_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .max }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_max_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .inc }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_inc_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .u32 }
// .op        = { .dec }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_dec_t,
  uint32_t* dstMem,
  const uint32_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .s32 }
// .op        = { .min }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_min_t,
  int32_t* dstMem,
  const int32_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .s32 }
// .op        = { .max }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_max_t,
  int32_t* dstMem,
  const int32_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .s32 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  int32_t* dstMem,
  const int32_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .u64 }
// .op        = { .min }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_min_t,
  uint64_t* dstMem,
  const uint64_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .u64 }
// .op        = { .max }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_max_t,
  uint64_t* dstMem,
  const uint64_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .u64 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  uint64_t* dstMem,
  const uint64_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .s64 }
// .op        = { .min }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_min_t,
  int64_t* dstMem,
  const int64_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .s64 }
// .op        = { .max }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_max_t,
  int64_t* dstMem,
  const int64_t* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .f32 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  float* dstMem,
  const float* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .f64 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  double* dstMem,
  const double* srcMem,
  uint32_t size);
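
The arithmetic bulk_group overloads (.min/.max/.add/.inc/.dec over the integer types, and .add over .f32/.f64) all follow the same pattern as the bitwise example above: choose the op tag and pass pointers of the matching element type. As a sketch, the kernel below flushes a block's float partial sums into a global accumulator; accum and the buffer size are illustrative assumptions.

#include <cuda/ptx>

// Add 32 float partial sums from shared memory into a global accumulator buffer.
__global__ void flush_partial_sums(float* accum)
{
  alignas(16) __shared__ float partial[32]; // 128 bytes
  for (int i = threadIdx.x; i < 32; i += blockDim.x)
  {
    partial[i] = 1.0f; // illustrative values
  }
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // Element-wise .add.f32 reduction into global memory, completed via the bulk group.
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      accum, partial, sizeof(partial));
    cuda::ptx::cp_async_bulk_commit_group();
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
}

Swapping the op tag and the element type selects the corresponding integer overloads; note that only .add is listed for .f32 and .f64.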

cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 (emulates .add.s64; see "Emulation of the .s64 instruction" below)

// cp.reduce.async.bulk.dst.src.bulk_group.op.u64  [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .s64 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  int64_t* dstMem,
  const int64_t* srcMem,
  uint32_t size);

Emulation of the .s64 instruction

PTX does not currently (CTK 12.3) expose cp.reduce.async.bulk with the .add.s64 combination. Since two's-complement addition is bit-identical for signed and unsigned operands, cuda::ptx emulates it with the following int64_t overloads, which emit the .add.u64 instruction:

// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
// .dst       = { .shared::cluster }
// .src       = { .shared::cta }
// .type      = { .s64 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_cluster_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  int64_t* dstMem,
  const int64_t* srcMem,
  uint32_t size,
  uint64_t* rdsmem_bar);

// cp.reduce.async.bulk.dst.src.bulk_group.op.u64  [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .s64 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  int64_t* dstMem,
  const int64_t* srcMem,
  uint32_t size);
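
In other words, the int64_t overloads compile and behave like signed addition, but the emitted PTX instruction is the unsigned .add.u64 form. A short sketch of the bulk_group path; accum is an illustrative pointer.

#include <cuda/ptx>
#include <cstdint>

// Signed 64-bit add into global memory; cuda::ptx emits the .add.u64 instruction,
// which is bit-identical to signed addition under two's complement.
__global__ void add_s64_into_global(int64_t* accum)
{
  alignas(16) __shared__ int64_t vals[16]; // 128 bytes
  for (int i = threadIdx.x; i < 16; i += blockDim.x)
  {
    vals[i] = -static_cast<int64_t>(i);
  }
  __syncthreads();

  if (threadIdx.x == 0)
  {
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      accum, vals, sizeof(vals));
    cuda::ptx::cp_async_bulk_commit_group();
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
}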

FP16 instructions

cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .f16 }
// .op        = { .min }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_min_t,
  __half* dstMem,
  const __half* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .f16 }
// .op        = { .max }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_max_t,
  __half* dstMem,
  const __half* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16

// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type  [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .f16 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  __half* dstMem,
  const __half* srcMem,
  uint32_t size);
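
The f16 overloads take __half from <cuda_fp16.h>; the .add form carries the .noftz qualifier, so subnormal values are not flushed to zero. A sketch of the .min.f16 overload; gmin and the buffer size are illustrative assumptions.

#include <cuda/ptx>
#include <cuda_fp16.h>

// Element-wise minimum of 64 __half values from shared memory into a global buffer.
__global__ void min_f16_into_global(__half* gmin)
{
  alignas(16) __shared__ __half vals[64]; // 128 bytes
  for (int i = threadIdx.x; i < 64; i += blockDim.x)
  {
    vals[i] = __float2half(static_cast<float>(i));
  }
  __syncthreads();

  if (threadIdx.x == 0)
  {
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_min,
      gmin, vals, sizeof(vals));
    cuda::ptx::cp_async_bulk_commit_group();
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
}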

BF16 instructions

cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .bf16 }
// .op        = { .min }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_min_t,
  __nv_bfloat16* dstMem,
  const __nv_bfloat16* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16

// cp.reduce.async.bulk.dst.src.bulk_group.op.type  [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .bf16 }
// .op        = { .max }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_max_t,
  __nv_bfloat16* dstMem,
  const __nv_bfloat16* srcMem,
  uint32_t size);

cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16

// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type  [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90
// .dst       = { .global }
// .src       = { .shared::cta }
// .type      = { .bf16 }
// .op        = { .add }
template <typename=void>
__device__ static inline void cp_reduce_async_bulk(
  cuda::ptx::space_global_t,
  cuda::ptx::space_shared_t,
  cuda::ptx::op_add_t,
  __nv_bfloat16* dstMem,
  const __nv_bfloat16* srcMem,
  uint32_t size);
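
The bf16 overloads mirror the f16 ones, taking __nv_bfloat16 from <cuda_bf16.h>. A sketch of the .add.noftz.bf16 overload; accum and the buffer size are illustrative assumptions.

#include <cuda/ptx>
#include <cuda_bf16.h>

// Accumulate 64 __nv_bfloat16 values from shared memory into a global buffer with .add.noftz.
__global__ void add_bf16_into_global(__nv_bfloat16* accum)
{
  alignas(16) __shared__ __nv_bfloat16 vals[64]; // 128 bytes
  for (int i = threadIdx.x; i < 64; i += blockDim.x)
  {
    vals[i] = __float2bfloat16(1.0f);
  }
  __syncthreads();

  if (threadIdx.x == 0)
  {
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      accum, vals, sizeof(vals));
    cuda::ptx::cp_async_bulk_commit_group();
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
}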