tcgen05.mma.ws
PTX ISA: tcgen05.mma.ws
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill
// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::fill` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use
// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::use` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and the trailing `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::lastuse` form in the comment
// above (`_tmem_a_` variant): A is passed as the 32-bit bracketed `[a_tmem]` operand
// and no `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` and
// `a_tmem` are tensor-memory addresses and `enable_input_d` selects accumulation
// into D; confirm against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::discard` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::discard` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::discard` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::discard` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and the trailing
// `zero_column_mask_desc` operand is present. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::discard` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::discard` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::discard` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Device-function declaration for the `.collector::b0::discard` form in the comment
// above: A is passed as the 64-bit `a_desc` operand and no
// `zero_column_mask_desc` operand is passed. Returns void.
// The unnamed `cta_group_1_t` tag corresponds to `.cta_group::1`; `kind` carries the
// template argument `Kind` for the `.kind` qualifier. The other parameters follow
// the PTX operand order, with `d_tmem` as the bracketed `[d_tmem]` operand.
// NOTE(review): operand semantics are not shown here -- presumably `d_tmem` is a
// tensor-memory address and `enable_input_d` selects accumulation into D; confirm
// against the PTX ISA description of tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard
// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill
// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use
// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use ([a_tmem] operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use ([a_tmem] operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse (a_desc operand, with zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse (a_desc operand, no zero_column_mask_desc)
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard
// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill
// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_desc` and `b_desc` are both 64-bit; neither is bracketed in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint64_t a_desc,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes a trailing 64-bit `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d,
uint64_t zero_column_mask_desc);
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// C++ declaration for the PTX form above.
// - `a_tmem` is 32-bit and is written bracketed (`[a_tmem]`) in the PTX operand list.
// - This overload takes no `zero_column_mask_desc` argument.
// - The leading `cta_group_1_t` argument is unnamed in this declaration; `.cta_group::1` is the only value listed.
// - `Kind` is the template parameter of the `kind` argument's type `kind_t<Kind>`; the `.kind` values listed
//   above are presumably the supported ones.
// NOTE(review): operand semantics are not described here -- confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
cuda::ptx::cta_group_1_t,
cuda::ptx::kind_t<Kind> kind,
uint32_t d_tmem,
uint32_t a_tmem,
uint64_t b_desc,
uint32_t idesc,
bool enable_input_d);