tcgen05.ld
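
The functions below are the cuda::ptx bindings for the tcgen05.ld family of instructions, which collectively load data from Tensor Memory at the address taddr into the registers of the calling warp (SM_100a and SM_101a targets, PTX ISA 8.6). The shape (16x64b, 16x128b, 16x256b, 32x32b, or 16x32bx2) together with the repetition factor .x1 through .x128 determines how many 32-bit registers each thread receives, i.e. the extent of out; the .pack::16b variants additionally pack two adjacent 16-bit elements into each 32-bit register during the load.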

tcgen05.ld.sync.aligned.16x64b.x1.b32

// tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[1],
  uint32_t taddr);
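
For illustration, a minimal usage sketch of this .x1 variant. Assumptions not present in the listing: taddr holds a valid Tensor Memory address (e.g. obtained via tcgen05.alloc), the kernel is compiled for sm_100a or sm_101a, and tcgen05_wait_ld is the companion wrapper for tcgen05.wait::ld that makes the loaded values safe to read.

#include <cuda/ptx>
#include <cstdint>

__device__ void load_16x64b_x1(uint32_t taddr) {
  uint32_t out[1]; // 16 rows x 64 bits = 1024 bits per warp -> 1 register per thread
  // Warp-collective load of the 16x64b.x1 shape from Tensor Memory.
  cuda::ptx::tcgen05_ld_16x64b(out, taddr);
  // Wait for the load to complete before consuming the destination registers.
  cuda::ptx::tcgen05_wait_ld();
  // ... use out[0] ...
}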

tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32

// tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[1],
  uint32_t taddr);
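
A sketch of the .pack::16b form under the same assumptions as above. Because the wrapper accepts any 4-byte element type, a pair of adjacent 16-bit values packed by the load can be received directly as a __half2 (the choice of __half2 is illustrative, not mandated by the listing).

#include <cuda/ptx>
#include <cuda_fp16.h>
#include <cstdint>

__device__ __half2 load_packed_pair(uint32_t taddr) {
  __half2 out[1]; // each 32-bit register holds two adjacent 16-bit elements
  cuda::ptx::tcgen05_ld_16x64b_pack_16b(out, taddr);
  cuda::ptx::tcgen05_wait_ld(); // assumed wrapper for tcgen05.wait::ld
  return out[0];
}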

tcgen05.ld.sync.aligned.16x64b.x2.b32

// tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32

// tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x4.b32

// tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32

// tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x8.b32

// tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32

// tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x16.b32

// tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32

// tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x32.b32

// tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32

// tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x64.b32

// tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32

// tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x128.b32

// tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32

// tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x1.b32

// tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[2],
  uint32_t taddr);
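
The destination extents scale with the shape: a sketch for 16x128b.x1 under the same assumptions as the earlier examples, where 16 rows of 128 bits (2048 bits per warp) come back as two 32-bit registers per thread.

#include <cuda/ptx>
#include <cstdint>

__device__ void load_16x128b_x1(uint32_t taddr) {
  uint32_t out[2]; // 2048 bits / 32 threads = 2 registers per thread
  cuda::ptx::tcgen05_ld_16x128b(out, taddr);
  cuda::ptx::tcgen05_wait_ld(); // assumed wrapper for tcgen05.wait::ld
}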

tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32

// tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x2.b32

// tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32

// tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x4.b32

// tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32

// tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x8.b32

// tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32

// tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x16.b32

// tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32

// tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x32.b32

// tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32

// tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x64.b32

// tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32

// tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x1.b32

// tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[4],
  uint32_t taddr);
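
The same wrapper name covers every repetition factor; the extent of the out array selects the .xN variant at overload resolution. A sketch for 16x256b.x4 under the same assumptions as the earlier examples:

#include <cuda/ptx>
#include <cstdint>

__device__ void load_16x256b_x4(uint32_t taddr) {
  uint32_t out[16]; // 16 x 256 bits x 4 repetitions = 16384 bits -> 16 registers per thread
  cuda::ptx::tcgen05_ld_16x256b(out, taddr); // extent 16 selects the .x4 form
  cuda::ptx::tcgen05_wait_ld();              // assumed wrapper for tcgen05.wait::ld
}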

tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32

// tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x2.b32

// tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32

// tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x4.b32

// tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32

// tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x8.b32

// tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32

// tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x16.b32

// tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32

// tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x32.b32

// tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32

// tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x1.b32

// tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[1],
  uint32_t taddr);
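
A sketch for the 32x32b shape with a larger repetition factor; the per-thread reduction at the end is purely illustrative, and the assumptions are the same as in the earlier examples.

#include <cuda/ptx>
#include <cstdint>

__device__ uint32_t sum_32x32b_x32(uint32_t taddr) {
  uint32_t out[32]; // 32 rows x 32 bits x 32 repetitions -> 32 registers per thread
  cuda::ptx::tcgen05_ld_32x32b(out, taddr); // extent 32 selects the .x32 form
  cuda::ptx::tcgen05_wait_ld();             // assumed wrapper for tcgen05.wait::ld
  uint32_t acc = 0;
  for (uint32_t v : out) { // per-thread sum of the loaded words
    acc += v;
  }
  return acc;
}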

tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32

// tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[1],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x2.b32

// tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32

// tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[2],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x4.b32

// tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32

// tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[4],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x8.b32

// tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32

// tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[8],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x16.b32

// tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32

// tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[16],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x32.b32

// tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32

// tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[32],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x64.b32

// tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32

// tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[64],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x128.b32

// tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32

// tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
  B32 (&out)[128],
  uint32_t taddr);

tcgen05.ld.sync.aligned.16x32bx2.x1.b32

// tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[1],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);
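
The 16x32bx2 shape loads two 16x32b halves; immHalfSplitoff is a compile-time immediate that, as the operand name suggests, gives the Tensor Memory offset of the second half relative to taddr, and it is passed through cuda::ptx::n32_t. In the sketch below the offset value 16 and the n32_t<16>{} spelling are illustrative assumptions; the other assumptions match the earlier examples.

#include <cuda/ptx>
#include <cstdint>

__device__ void load_16x32bx2_x1(uint32_t taddr) {
  uint32_t out[1]; // two 16x32b halves (1024 bits per warp) -> 1 register per thread
  cuda::ptx::tcgen05_ld_16x32bx2(out, taddr, cuda::ptx::n32_t<16>{});
  cuda::ptx::tcgen05_wait_ld(); // assumed wrapper for tcgen05.wait::ld
}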

tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32

// tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[1],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x2.b32

// tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32

// tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[2],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x4.b32

// tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32

// tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[4],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x8.b32

// tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32

// tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[8],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x16.b32

// tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32

// tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[16],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x32.b32

// tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32

// tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[32],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x64.b32

// tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32

// tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x128.b32

// tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);

tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32

// tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);