37 typename TransposeShape,
/// Specialization of Transpose for int8_t elements and a 4x4 pitch-linear
/// transpose shape. ElementCount_ is the number of elements held by one Fragment.
///
/// NOTE(review): this listing appears to come from a generated documentation
/// page -- the integers at the start of many lines look like line numbers of the
/// original header, and several original lines are not visible here (the
/// declarations of TransposeShape, Element and a0..a3, the signature of the
/// enclosing member function, and whatever consumes b0..b3 after the asm
/// statements). Comments below that depend on those lines are hedged.
42 template <
int ElementCount_>
43 struct Transpose<ElementCount_, layout::PitchLinearShape<4,4> , int8_t> {
// Number of elements in a Fragment, taken directly from the template argument.
45 static const int kElementCount = ElementCount_;
// Storage type operated on: an array of kElementCount values of type Element.
48 using Fragment = cutlass::Array<Element, kElementCount>;
// Compile-time guard: the fragment must divide evenly into tiles of
// TransposeShape::kCount elements (16 according to the message below).
50 static_assert(!(kElementCount % TransposeShape::kCount),
"Shape needs to be multiple of 16 elements to do a 4x4 transpose");
// View the source and destination fragments as arrays of int so the data can be
// fed to the 32-bit register operands of the asm statements below.
// presumably each int packs four int8_t elements (one row of a tile) -- confirm.
56 int* src_int =
reinterpret_cast<int*
>(&src);
57 int* dst_int =
reinterpret_cast<int*
>(&dst);
// One iteration per tile of TransposeShape::kCount elements.
60 for (
int i = 0; i < kElementCount / TransposeShape::kCount; i++){
// Indices of the four consecutive ints that belong to tile i.
62 int const i0 = 4 * i + 0;
63 int const i1 = 4 * i + 1;
64 int const i2 = 4 * i + 2;
65 int const i3 = 4 * i + 3;
// a0..a3 are not declared in the visible lines; presumably they are loaded from
// src_int[i0..i3] -- TODO confirm against the full header.
//
// b0..b3 receive the results of the permutes; c0 is a scratch register that is
// overwritten in every group of three asm statements.
//
// Each group below builds one output word from three PTX prmt.b32 instructions.
// Per the PTX ISA (default mode), every hex digit of the selector picks one
// result byte from the 8-byte pool formed by the two sources (first source =
// bytes 0-3, second source = bytes 4-7), lowest digit for the lowest result byte:
//   0x00N(4+N)... here written as 0x0040/0x0051/0x0062/0x0073: result byte 0 is
//          byte N of the first source, result byte 1 is byte N of the second
//          source (N = 0,1,2,3); the upper two result bytes are not used later.
//   0x5410: result = low two bytes of the first source followed by the low two
//          bytes of the second source.
// Net effect of a group for byte position N:
//   bN = { a0.byteN, a1.byteN, a2.byteN, a3.byteN }.

// Byte 0 of a0..a3 gathered into b0.
72 int b0, b1, b2, b3, c0;
73 asm volatile(
"prmt.b32 %0, %1, %2, 0x0040;" :
"=r"(b0) :
"r"(a0),
"r"(a1));
74 asm volatile(
"prmt.b32 %0, %1, %2, 0x0040;" :
"=r"(c0) :
"r"(a2),
"r"(a3));
75 asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" :
"=r"(b0) :
"r"(b0),
"r"(c0));
// Byte 1 of a0..a3 gathered into b1.
77 asm volatile(
"prmt.b32 %0, %1, %2, 0x0051;" :
"=r"(b1) :
"r"(a0),
"r"(a1));
78 asm volatile(
"prmt.b32 %0, %1, %2, 0x0051;" :
"=r"(c0) :
"r"(a2),
"r"(a3));
79 asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" :
"=r"(b1) :
"r"(b1),
"r"(c0));
// Byte 2 of a0..a3 gathered into b2.
81 asm volatile(
"prmt.b32 %0, %1, %2, 0x0062;" :
"=r"(b2) :
"r"(a0),
"r"(a1));
82 asm volatile(
"prmt.b32 %0, %1, %2, 0x0062;" :
"=r"(c0) :
"r"(a2),
"r"(a3));
83 asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" :
"=r"(b2) :
"r"(b2),
"r"(c0));
// Byte 3 of a0..a3 gathered into b3.
85 asm volatile(
"prmt.b32 %0, %1, %2, 0x0073;" :
"=r"(b3) :
"r"(a0),
"r"(a1));
86 asm volatile(
"prmt.b32 %0, %1, %2, 0x0073;" :
"=r"(c0) :
"r"(a2),
"r"(a3));
87 asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" :
"=r"(b3) :
"r"(b3),
"r"(c0));
Definition: aligned_buffer.h:35
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110