tcgen05.st
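PTX ISA reference: tcgen05.st (asynchronous collective store to Tensor Memory). The overloads below are grouped by shape (16x64b, 16x128b, 16x256b, 32x32b, 16x32bx2) and by vector count (.xN), each with and without the .unpack::16b qualifier; the 16x32bx2 overloads additionally take the immHalfSplitoff immediate operand.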
tcgen05.st.sync.aligned.16x64b.x1.b32
// tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b(
uint32_t taddr,
const B32 (&values)[1]);
tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32
// tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b_unpack_16b(
uint32_t taddr,
const B32 (&values)[1]);
tcgen05.st.sync.aligned.16x64b.x2.b32
// tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b(
uint32_t taddr,
const B32 (&values)[2]);
tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32
// tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b_unpack_16b(
uint32_t taddr,
const B32 (&values)[2]);
tcgen05.st.sync.aligned.16x64b.x4.b32
// tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b(
uint32_t taddr,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32
// tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b_unpack_16b(
uint32_t taddr,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.16x64b.x8.b32
// tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b(
uint32_t taddr,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32
// tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b_unpack_16b(
uint32_t taddr,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.16x64b.x16.b32
// tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b(
uint32_t taddr,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32
// tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b_unpack_16b(
uint32_t taddr,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.16x64b.x32.b32
// tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b(
uint32_t taddr,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32
// tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b_unpack_16b(
uint32_t taddr,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.16x64b.x64.b32
// tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b(
uint32_t taddr,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32
// tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b_unpack_16b(
uint32_t taddr,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.16x64b.x128.b32
// tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b(
uint32_t taddr,
const B32 (&values)[128]);
tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32
// tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x64b_unpack_16b(
uint32_t taddr,
const B32 (&values)[128]);
tcgen05.st.sync.aligned.16x128b.x1.b32
// tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b(
uint32_t taddr,
const B32 (&values)[2]);
tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32
// tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b_unpack_16b(
uint32_t taddr,
const B32 (&values)[2]);
tcgen05.st.sync.aligned.16x128b.x2.b32
// tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b(
uint32_t taddr,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32
// tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b_unpack_16b(
uint32_t taddr,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.16x128b.x4.b32
// tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b(
uint32_t taddr,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32
// tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b_unpack_16b(
uint32_t taddr,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.16x128b.x8.b32
// tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b(
uint32_t taddr,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32
// tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b_unpack_16b(
uint32_t taddr,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.16x128b.x16.b32
// tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b(
uint32_t taddr,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32
// tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b_unpack_16b(
uint32_t taddr,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.16x128b.x32.b32
// tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b(
uint32_t taddr,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32
// tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b_unpack_16b(
uint32_t taddr,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.16x128b.x64.b32
// tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b(
uint32_t taddr,
const B32 (&values)[128]);
tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32
// tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x128b_unpack_16b(
uint32_t taddr,
const B32 (&values)[128]);
tcgen05.st.sync.aligned.16x256b.x1.b32
// tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b(
uint32_t taddr,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32
// tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b_unpack_16b(
uint32_t taddr,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.16x256b.x2.b32
// tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b(
uint32_t taddr,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32
// tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b_unpack_16b(
uint32_t taddr,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.16x256b.x4.b32
// tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b(
uint32_t taddr,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32
// tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b_unpack_16b(
uint32_t taddr,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.16x256b.x8.b32
// tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b(
uint32_t taddr,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32
// tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b_unpack_16b(
uint32_t taddr,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.16x256b.x16.b32
// tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b(
uint32_t taddr,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32
// tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b_unpack_16b(
uint32_t taddr,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.16x256b.x32.b32
// tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b(
uint32_t taddr,
const B32 (&values)[128]);
tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32
// tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x256b_unpack_16b(
uint32_t taddr,
const B32 (&values)[128]);
tcgen05.st.sync.aligned.32x32b.x1.b32
// tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b(
uint32_t taddr,
const B32 (&values)[1]);
tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32
// tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b_unpack_16b(
uint32_t taddr,
const B32 (&values)[1]);
tcgen05.st.sync.aligned.32x32b.x2.b32
// tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b(
uint32_t taddr,
const B32 (&values)[2]);
tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32
// tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b_unpack_16b(
uint32_t taddr,
const B32 (&values)[2]);
tcgen05.st.sync.aligned.32x32b.x4.b32
// tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b(
uint32_t taddr,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32
// tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b_unpack_16b(
uint32_t taddr,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.32x32b.x8.b32
// tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b(
uint32_t taddr,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32
// tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b_unpack_16b(
uint32_t taddr,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.32x32b.x16.b32
// tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b(
uint32_t taddr,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32
// tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b_unpack_16b(
uint32_t taddr,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.32x32b.x32.b32
// tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b(
uint32_t taddr,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32
// tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b_unpack_16b(
uint32_t taddr,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.32x32b.x64.b32
// tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b(
uint32_t taddr,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32
// tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b_unpack_16b(
uint32_t taddr,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.32x32b.x128.b32
// tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b(
uint32_t taddr,
const B32 (&values)[128]);
tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32
// tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_32x32b_unpack_16b(
uint32_t taddr,
const B32 (&values)[128]);
tcgen05.st.sync.aligned.16x32bx2.x1.b32
// tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[1]);
tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32
// tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2_unpack_16b(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[1]);
tcgen05.st.sync.aligned.16x32bx2.x2.b32
// tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[2]);
tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32
// tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2_unpack_16b(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[2]);
tcgen05.st.sync.aligned.16x32bx2.x4.b32
// tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32
// tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2_unpack_16b(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[4]);
tcgen05.st.sync.aligned.16x32bx2.x8.b32
// tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32
// tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2_unpack_16b(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[8]);
tcgen05.st.sync.aligned.16x32bx2.x16.b32
// tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32
// tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2_unpack_16b(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[16]);
tcgen05.st.sync.aligned.16x32bx2.x32.b32
// tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32
// tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2_unpack_16b(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[32]);
tcgen05.st.sync.aligned.16x32bx2.x64.b32
// tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32
// tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2_unpack_16b(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[64]);
tcgen05.st.sync.aligned.16x32bx2.x128.b32
// tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[128]);
tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32
// tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a
template <int N32, typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_st_16x32bx2_unpack_16b(
uint32_t taddr,
cuda::ptx::n32_t<N32> immHalfSplitoff,
const B32 (&values)[128]);