tcgen05.ld
PTX ISA: tcgen05.ld
tcgen05.ld.sync.aligned.16x64b.x1.b32
// tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b and .x1 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 1 B32 element, matching the `out` operand;
//           presumably receives the loaded value (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
B32 (&out)[1],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32
// tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b, .x1 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 1 B32 element, matching the `out` operand;
//           presumably receives the loaded value (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
B32 (&out)[1],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x2.b32
// tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b and .x2 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 2 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
B32 (&out)[2],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32
// tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b, .x2 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 2 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
B32 (&out)[2],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x4.b32
// tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b and .x4 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
B32 (&out)[4],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32
// tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b, .x4 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
B32 (&out)[4],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x8.b32
// tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b and .x8 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
B32 (&out)[8],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32
// tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b, .x8 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
B32 (&out)[8],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x16.b32
// tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b and .x16 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
B32 (&out)[16],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32
// tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b, .x16 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
B32 (&out)[16],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x32.b32
// tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b and .x32 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
B32 (&out)[32],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32
// tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b, .x32 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
B32 (&out)[32],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x64.b32
// tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b and .x64 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
B32 (&out)[64],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32
// tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b, .x64 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
B32 (&out)[64],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x128.b32
// tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b and .x128 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
B32 (&out)[128],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32
// tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x64b, .x128 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
B32 (&out)[128],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x1.b32
// tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b and .x1 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 2 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
B32 (&out)[2],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32
// tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b, .x1 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 2 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
B32 (&out)[2],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x2.b32
// tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b and .x2 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
B32 (&out)[4],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32
// tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b, .x2 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
B32 (&out)[4],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x4.b32
// tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b and .x4 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
B32 (&out)[8],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32
// tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b, .x4 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
B32 (&out)[8],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x8.b32
// tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b and .x8 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
B32 (&out)[16],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32
// tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b, .x8 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
B32 (&out)[16],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x16.b32
// tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b and .x16 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
B32 (&out)[32],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32
// tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b, .x16 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
B32 (&out)[32],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x32.b32
// tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b and .x32 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
B32 (&out)[64],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32
// tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b, .x32 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
B32 (&out)[64],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x64.b32
// tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b and .x64 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
B32 (&out)[128],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32
// tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x128b, .x64 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
B32 (&out)[128],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x1.b32
// tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b and .x1 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
B32 (&out)[4],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32
// tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b, .x1 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
B32 (&out)[4],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x2.b32
// tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b and .x2 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
B32 (&out)[8],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32
// tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b, .x2 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
B32 (&out)[8],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x4.b32
// tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b and .x4 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
B32 (&out)[16],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32
// tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b, .x4 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
B32 (&out)[16],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x8.b32
// tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b and .x8 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
B32 (&out)[32],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32
// tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b, .x8 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
B32 (&out)[32],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x16.b32
// tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b and .x16 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
B32 (&out)[64],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32
// tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b, .x16 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
B32 (&out)[64],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x32.b32
// tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b and .x32 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b(
B32 (&out)[128],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32
// tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x256b, .x32 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x256b_pack_16b(
B32 (&out)[128],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x1.b32
// tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b and .x1 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 1 B32 element, matching the `out` operand;
//           presumably receives the loaded value (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
B32 (&out)[1],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32
// tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b, .x1 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 1 B32 element, matching the `out` operand;
//           presumably receives the loaded value (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
B32 (&out)[1],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x2.b32
// tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b and .x2 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 2 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
B32 (&out)[2],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32
// tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b, .x2 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 2 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
B32 (&out)[2],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x4.b32
// tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b and .x4 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
B32 (&out)[4],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32
// tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b, .x4 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
B32 (&out)[4],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x8.b32
// tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b and .x8 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
B32 (&out)[8],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32
// tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b, .x8 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
B32 (&out)[8],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x16.b32
// tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b and .x16 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
B32 (&out)[16],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32
// tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b, .x16 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
B32 (&out)[16],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x32.b32
// tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b and .x32 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
B32 (&out)[32],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32
// tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b, .x32 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
B32 (&out)[32],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x64.b32
// tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b and .x64 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
B32 (&out)[64],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32
// tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b, .x64 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
B32 (&out)[64],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x128.b32
// tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b and .x128 qualifiers).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b(
B32 (&out)[128],
uint32_t taddr);
tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32
// tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.32x32b, .x128 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32   - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   out   - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//           presumably receives the loaded values (confirm against the PTX ISA).
//   taddr - 32-bit unsigned value matching the `[taddr]` address operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_32x32b_pack_16b(
B32 (&out)[128],
uint32_t taddr);
tcgen05.ld.sync.aligned.16x32bx2.x1.b32
// tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2 and .x1 qualifiers).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 1 B32 element, matching the `out` operand;
//                     presumably receives the loaded value (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
B32 (&out)[1],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32
// tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2, .x1 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 1 B32 element, matching the `out` operand;
//                     presumably receives the loaded value (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
B32 (&out)[1],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x2.b32
// tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2 and .x2 qualifiers).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 2 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
B32 (&out)[2],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32
// tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2, .x2 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 2 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
B32 (&out)[2],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x4.b32
// tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2 and .x4 qualifiers).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
B32 (&out)[4],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32
// tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2, .x4 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 4 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
B32 (&out)[4],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x8.b32
// tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2 and .x8 qualifiers).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
B32 (&out)[8],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32
// tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2, .x8 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 8 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
B32 (&out)[8],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x16.b32
// tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2 and .x16 qualifiers).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
B32 (&out)[16],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32
// tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2, .x16 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 16 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
B32 (&out)[16],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x32.b32
// tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2 and .x32 qualifiers).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
B32 (&out)[32],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32
// tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2, .x32 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 32 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
B32 (&out)[32],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x64.b32
// tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2 and .x64 qualifiers).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
B32 (&out)[64],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32
// tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2, .x64 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 64 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
B32 (&out)[64],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x128.b32
// tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2 and .x128 qualifiers).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
B32 (&out)[128],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);
tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32
// tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
// Device-only declaration paired with the PTX form above (.16x32bx2, .x128 and .pack::16b qualifiers;
// see the PTX ISA for what .pack::16b does to the loaded data).
//   B32             - element type; the enable_if_t constraint admits only types with sizeof(B32) == 4.
//   N32             - int template parameter carried in the type of immHalfSplitoff.
//   out             - reference to an array of exactly 128 B32 elements, matching the `out` operand;
//                     presumably receives the loaded values (confirm against the PTX ISA).
//   taddr           - 32-bit unsigned value matching the `[taddr]` address operand.
//   immHalfSplitoff - cuda::ptx::n32_t<N32> argument matching the `immHalfSplitoff` operand;
//                     presumably N32 is emitted as an immediate (confirm against cuda::ptx docs).
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
B32 (&out)[128],
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff);