tcgen05.mma.ws

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill

// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::fill` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use

// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::use` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::lastuse` is carried by the function name, not by a parameter.
// Form taking `[a_tmem]` (32-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint32_t a_tmem,  // `[a_tmem]` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::discard` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::tf32` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::discard` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f8f6f4` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::discard` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::i8` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::discard` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, with a trailing `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d,  // `enable_input_d` operand
  uint64_t zero_column_mask_desc);  // `zero_column_mask_desc` operand

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
//
// Listed under the `.kind::f16` heading; the declaration itself is generic over `Kind`.
// `.collector::b0::discard` is carried by the function name, not by a parameter.
// Form taking `a_desc` (64-bit) as the second operand, without `zero_column_mask_desc`.
// The parameter notes only map each argument onto the operand list above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
  cuda::ptx::cta_group_1_t,  // unnamed tag; `.cta_group::1` is the only value listed above
  cuda::ptx::kind_t<Kind> kind,  // presumably selects the `.kind` qualifier via `Kind`
  uint32_t d_tmem,  // `[d_tmem]` operand
  uint64_t a_desc,  // `a_desc` operand
  uint64_t b_desc,  // `b_desc` operand
  uint32_t idesc,  // `idesc` operand
  bool enable_input_d);  // `enable_input_d` operand

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard

// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill

// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use

// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 64-bit a_desc and no zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Overload with a 32-bit a_tmem (written [a_tmem] in the PTX form above) in place of
// a_desc, plus the trailing zero_column_mask_desc operand.
// After the unnamed cta_group_1_t value and `kind` (presumably selecting .cta_group
// and .kind), the arguments follow the operand order of the PTX form above.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard

// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill

// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and has no
// zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and the trailing 64-bit zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use

// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 32-bit a_tmem (written [a_tmem] in the
// PTX form above) and has no zero_column_mask_desc operand (a separate overload takes it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body in this listing). This overload takes a 64-bit a_desc and the trailing 64-bit
// zero_column_mask_desc operand (a separate overload omits it).
// The unnamed cta_group_1_t argument can act only through its type; it and `kind` presumably select the
// .cta_group and .kind qualifiers listed above. The declaration itself is generic over Kind.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse

// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::lastuse` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard

// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b2::discard` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill

// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::fill` PTX form above.
// This overload takes A as the 32-bit [a_tmem] operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use

// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::use` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use

// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::use` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use

// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::use` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use

// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::use` PTX form above.
// This overload takes A as the 64-bit a_desc operand plus the trailing zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use

// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::use` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f16) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use

// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::use` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::tf32) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use

// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::use` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::f8f6f4) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use

// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Declaration only (no body is shown here) for the `.collector::b3::use` PTX form above.
// This overload takes A as the 64-bit a_desc operand and has no zero_column_mask_desc operand.
// The `kind` tag carries the Kind template argument, which presumably selects the .kind modifier
// (this entry's heading names .kind::i8) — TODO confirm against the PTX ISA.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::use, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::use, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::use, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::use, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::use, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::use, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::use, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::use, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is a_desc, with a zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is a_desc, with a zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is a_desc, with a zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is a_desc, with a zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is a_desc, no zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is a_desc, no zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is a_desc, no zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is a_desc, no zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::lastuse, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is a_desc, with a zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is a_desc, with a zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is a_desc, with a zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is a_desc, with a zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is a_desc, no zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is a_desc, no zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is a_desc, no zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is a_desc, no zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_desc, b_desc          - 64-bit values for the a_desc and b_desc operands.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint64_t a_desc,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is the bracketed [a_tmem], with a zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
//   zero_column_mask_desc   - 64-bit value for the zero_column_mask_desc operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d,
  uint64_t zero_column_mask_desc);

tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::f16 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::tf32 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::f8f6f4 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);

tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard

// Declares the device function template corresponding to the tcgen05.mma.ws instruction form below
// (.collector::b3::discard, second operand is the bracketed [a_tmem], no zero_column_mask_desc operand).
// This entry sits under the .kind::i8 heading; the signature itself is generic over Kind.
// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
// .cta_group = { .cta_group::1 }
// .kind      = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
// Arguments (in order):
//   cta_group_1_t (unnamed) - tag; .cta_group::1 is the only .cta_group value listed above.
//   kind                    - tag whose Kind picks one of the .kind values listed above.
//   d_tmem                  - 32-bit value for the bracketed [d_tmem] operand.
//   a_tmem                  - 32-bit value for the bracketed [a_tmem] operand.
//   b_desc                  - 64-bit value for the b_desc operand.
//   idesc                   - 32-bit value for the idesc operand.
//   enable_input_d          - boolean for the enable_input_d operand.
// NOTE(review): operand semantics are not stated here; consult the PTX ISA entry for tcgen05.mma.ws.
template <cuda::ptx::dot_kind Kind>
__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
  cuda::ptx::cta_group_1_t,
  cuda::ptx::kind_t<Kind> kind,
  uint32_t d_tmem,
  uint32_t a_tmem,
  uint64_t b_desc,
  uint32_t idesc,
  bool enable_input_d);