40 template <
int Interleaved,
typename Element,
typename Layout>
44 const int InstructionShapeCol = 8;
46 const int ElementsPerThread = InstructionShapeCol / 4;
48 const int ReorderedElementsPerThread =
51 for (
int n = 0; n < problem_size.
n(); n++) {
52 for (
int k = 0; k < problem_size.
k(); k++) {
53 dest.
at({k, (n / Interleaved) * Interleaved +
54 ((n % ReorderedElementsPerThread) / ElementsPerThread) *
56 ((n % Interleaved) / ReorderedElementsPerThread) *
58 (n % ElementsPerThread)}) = src.
at({k, n});
Definition: aligned_buffer.h:35
void reorder_column(TensorRef< Element, Layout > dest, TensorRef< Element, Layout > src, cutlass::gemm::GemmCoord problem_size)
Definition: host_reorder.h:41
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
Definition: include/cutlass/gemm/gemm.h:94
CUTLASS_HOST_DEVICE Index const & n() const
Returns the GEMM N coordinate.
Definition: include/cutlass/gemm/gemm.h:137
Defines a structure containing strides and a pointer to tensor data.
CUTLASS_HOST_DEVICE Index const & k() const
Returns the GEMM K coordinate.
Definition: include/cutlass/gemm/gemm.h:145
CUTLASS_HOST_DEVICE Reference at(TensorCoord const &coord) const
Returns a reference to the element at a given Coord.
Definition: tensor_ref.h:307
HostTensor contributes management for both host and device memory.