tcgen05.ld#

tcgen05.ld.sync.aligned.16x64b.x1.b32#

// tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 1 B32 element, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[1],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 1 B32 element, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[1],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x2.b32#

// tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 2 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 2 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x4.b32#

// tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 4 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 4 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x8.b32#

// tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 8 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 8 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x16.b32#

// tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 16 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 16 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x32.b32#

// tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 32 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 32 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x64.b32#

// tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 64 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 64 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x128.b32#

// tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 128 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 128 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x1.b32#

// tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 2 B32 elements for this .x1 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 2 B32 elements for this .x1 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x2.b32#

// tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 4 B32 elements for this .x2 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 4 B32 elements for this .x2 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x4.b32#

// tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 8 B32 elements for this .x4 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 8 B32 elements for this .x4 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x8.b32#

// tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 16 B32 elements for this .x8 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 16 B32 elements for this .x8 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x16.b32#

// tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 32 B32 elements for this .x16 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 32 B32 elements for this .x16 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x32.b32#

// tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 64 B32 elements for this .x32 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 64 B32 elements for this .x32 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x64.b32#

// tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 128 B32 elements for this .x64 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 128 B32 elements for this .x64 form of the 16x128b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x1.b32#

// tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 4 B32 elements for this .x1 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 4 B32 elements for this .x1 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x2.b32#

// tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 8 B32 elements for this .x2 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 8 B32 elements for this .x2 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x4.b32#

// tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 16 B32 elements for this .x4 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 16 B32 elements for this .x4 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x8.b32#

// tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 32 B32 elements for this .x8 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 32 B32 elements for this .x8 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x16.b32#

// tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 64 B32 elements for this .x16 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 64 B32 elements for this .x16 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x32.b32#

// tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 128 B32 elements for this .x32 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 128 B32 elements for this .x32 form of the 16x256b shape, bound by reference.
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x1.b32#

// tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 1 B32 element, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[1],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32#

// tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 1 B32 element, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[1],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x2.b32#

// tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 2 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32#

// tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 2 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x4.b32#

// tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 4 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32#

// tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 4 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x8.b32#

// tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 8 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32#

// tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 8 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x16.b32#

// tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 16 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32#

// tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 16 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x32.b32#

// tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 32 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32#

// tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 32 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x64.b32#

// tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 64 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32#

// tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 64 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x128.b32#

// tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 128 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32#

// tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint).
// out:   array of exactly 128 B32 elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x32bx2.x1.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 1 B32 element, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[1],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 1 B32 element, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[1],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x2.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 2 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 2 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x4.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 4 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 4 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x8.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 8 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 8 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x16.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 16 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 16 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x32.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 32 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 32 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x64.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 64 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 64 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x128.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 128 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32#

// tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
// Declaration for the .pack::16b form of the instruction (note the _pack_16b suffix).
// B32 accepts any 4-byte type (see the enable_if_t constraint); N32 is normally deduced from immHalfSplitoff.
// out:             array of exactly 128 B32 elements, bound by reference (the `out` operand).
// taddr:           32-bit unsigned value used as the [taddr] address operand.
// immHalfSplitoff: compile-time integer N32 carried by cuda::ptx::n32_t, passed as the trailing operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.32x32b.x2.u32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x2.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 2 uint32_t elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns uint32_t; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[2],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x2.u32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x2.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 2 uint32_t elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns uint32_t; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[2],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x2.s32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x2.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 2 int32_t elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns int32_t; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[2],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x2.s32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x2.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 2 int32_t elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns int32_t; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[2],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x2.f32.min.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x2.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// Declaration for the .abs form of the f32 instruction (note the _abs suffix).
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 2 float elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns float; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[2],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x2.f32.max.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x2.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// Declaration for the .abs form of the f32 instruction (note the _abs suffix).
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 2 float elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns float; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[2],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x2.f32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x2.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 2 float elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns float; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[2],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x2.f32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x2.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 2 float elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns float; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[2],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x4.u32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x4.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 4 uint32_t elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns uint32_t; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[4],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x4.u32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x4.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 4 uint32_t elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns uint32_t; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[4],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x4.s32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x4.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 4 int32_t elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns int32_t; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[4],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x4.s32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x4.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 4 int32_t elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns int32_t; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[4],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x4.f32.min.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x4.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// Declaration for the .abs form of the f32 instruction (note the _abs suffix).
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 4 float elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns float; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[4],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x4.f32.max.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x4.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// Declaration for the .abs form of the f32 instruction (note the _abs suffix).
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 4 float elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns float; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[4],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x4.f32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x4.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 4 float elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns float; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[4],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x4.f32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x4.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 4 float elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns float; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[4],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x8.u32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x8.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
// One declaration serves both .min and .max: Op is carried by the `op` tag argument.
// out:   array of exactly 8 uint32_t elements, bound by reference (the `out` operand).
// taddr: 32-bit unsigned value used as the [taddr] address operand.
// Returns uint32_t; presumably the `redval` operand of the PTX form above -- TODO confirm against the PTX ISA.
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[8],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x8.u32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x8.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[8],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x8.s32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x8.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[8],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x8.s32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x8.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[8],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x8.f32.min.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x8.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[8],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x8.f32.max.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x8.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[8],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x8.f32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x8.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[8],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x8.f32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x8.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[8],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x16.u32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x16.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[16],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x16.u32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x16.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[16],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x16.s32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x16.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[16],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x16.s32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x16.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[16],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x16.f32.min.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x16.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[16],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x16.f32.max.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x16.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[16],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x16.f32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x16.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[16],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x16.f32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x16.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[16],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x32.u32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x32.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[32],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x32.u32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x32.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[32],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x32.s32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x32.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[32],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x32.s32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x32.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[32],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x32.f32.min.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x32.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[32],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x32.f32.max.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x32.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[32],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x32.f32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x32.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[32],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x32.f32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x32.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[32],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x64.u32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x64.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[64],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x64.u32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x64.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[64],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x64.s32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x64.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[64],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x64.s32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x64.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[64],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x64.f32.min.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x64.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[64],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x64.f32.max.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x64.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[64],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x64.f32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x64.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[64],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x64.f32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x64.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[64],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x128.u32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x128.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[128],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x128.u32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x128.u32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[128],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x128.s32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x128.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[128],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x128.s32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x128.s32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[128],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x128.f32.min.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x128.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[128],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x128.f32.max.abs#

// tcgen05.ld.red.sync.aligned.32x32b.x128.f32.op.abs out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[128],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x128.f32.min#

// tcgen05.ld.red.sync.aligned.32x32b.x128.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[128],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.32x32b.x128.f32.max#

// tcgen05.ld.red.sync.aligned.32x32b.x128.f32.op out, redval, [taddr]; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_32x32b(
  cuda::ptx::op_t<Op> op,
  float (&out)[128],
  uint32_t taddr);

tcgen05.ld.red.sync.aligned.16x32bx2.x2.u32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x2.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x2.u32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x2.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x2.s32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x2.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x2.s32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x2.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x2.f32.min.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x2.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x2.f32.max.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x2.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x2.f32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x2.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x2.f32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x2.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x4.u32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x4.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x4.u32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x4.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x4.s32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x4.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x4.s32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x4.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x4.f32.min.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x4.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x4.f32.max.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x4.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x4.f32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x4.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x4.f32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x4.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x8.u32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x8.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x8.u32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x8.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x8.s32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x8.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x8.s32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x8.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x8.f32.min.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x8.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x8.f32.max.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x8.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x8.f32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x8.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x8.f32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x8.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x16.u32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x16.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x16.u32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x16.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x16.s32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x16.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x16.s32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x16.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x16.f32.min.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x16.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x16.f32.max.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x16.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x16.f32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x16.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x16.f32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x16.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x32.u32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x32.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x32.u32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x32.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x32.s32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x32.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x32.s32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x32.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x32.f32.min.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x32.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x32.f32.max.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x32.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x32.f32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x32.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x32.f32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x32.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x64.u32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x64.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x64.u32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x64.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x64.s32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x64.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.min` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 64 int32_t values, matching the `.x64.s32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns int32_t; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x64.s32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x64.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.max` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 64 int32_t values, matching the `.x64.s32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns int32_t; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x64.f32.min.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x64.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the `.abs` instruction form above (note the `_abs`
// suffix in the function name). This entry is listed under the `.min.abs` heading;
// the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 64 float values, matching the `.x64.f32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns float; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x64.f32.max.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x64.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the `.abs` instruction form above (note the `_abs`
// suffix in the function name). This entry is listed under the `.max.abs` heading;
// the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 64 float values, matching the `.x64.f32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns float; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x64.f32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x64.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.min` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 64 float values, matching the `.x64.f32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns float; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x64.f32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x64.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.max` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 64 float values, matching the `.x64.f32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns float; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x128.u32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x128.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.min` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 128 uint32_t values, matching the `.x128.u32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns uint32_t; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x128.u32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x128.u32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.max` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 128 uint32_t values, matching the `.x128.u32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns uint32_t; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline uint32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  uint32_t (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x128.s32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x128.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.min` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 128 int32_t values, matching the `.x128.s32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns int32_t; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x128.s32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x128.s32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.max` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 128 int32_t values, matching the `.x128.s32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns int32_t; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline int32_t tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  int32_t (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x128.f32.min.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x128.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the `.abs` instruction form above (note the `_abs`
// suffix in the function name). This entry is listed under the `.min.abs` heading;
// the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 128 float values, matching the `.x128.f32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns float; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x128.f32.max.abs#

// tcgen05.ld.red.sync.aligned.16x32bx2.x128.f32.op.abs out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the `.abs` instruction form above (note the `_abs`
// suffix in the function name). This entry is listed under the `.max.abs` heading;
// the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 128 float values, matching the `.x128.f32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns float; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2_abs(
  cuda::ptx::op_t<Op> op,
  float (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x128.f32.min#

// tcgen05.ld.red.sync.aligned.16x32bx2.x128.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.min` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 128 float values, matching the `.x128.f32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns float; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.red.sync.aligned.16x32bx2.x128.f32.max#

// tcgen05.ld.red.sync.aligned.16x32bx2.x128.f32.op out, redval, [taddr], immHalfSplitoff; // PTX ISA 88, SM_103a, SM_103f, SM_110a, SM_110f
// .op        = { .min, .max }
//
// Declaration of the wrapper for the instruction form above. This entry is listed
// under the `.max` heading; the declaration itself is generic over Op.
//   op              - tag whose template argument Op selects `.op` (.min or .max).
//   out             - reference to an array of 128 float values, matching the `.x128.f32` form.
//   taddr           - 32-bit value for the `[taddr]` operand.
//   immHalfSplitoff - tag carrying N32 for the `immHalfSplitoff` operand; N32 is a
//                     template argument, so it is fixed at compile time.
// Returns float; presumably the `redval` operand (no parameter carries it) - confirm against the PTX ISA.
template <int N32, cuda::ptx::dot_op Op>
__device__ static inline float tcgen05_ld_red_16x32bx2(
  cuda::ptx::op_t<Op> op,
  float (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);