CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
|
#include <default_gemm_configuration.h>
Public Types | |
using | ThreadblockShape = GemmShape< 128, 256, 128 > |
using | WarpShape = GemmShape< 64, 64, 128 > |
using | InstructionShape = GemmShape< 8, 8, 32 > |
using | EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< ElementC, 128/sizeof_bits< ElementC >::value, int32_t, float > |
using | Operator = arch::OpMultiplyAddSaturate |
Static Public Attributes | |
static int const | kAlignmentA = 128 / sizeof_bits<int4b_t>::value |
static int const | kAlignmentB = 128 / sizeof_bits<uint4b_t>::value |
static int const | kStages = 2 |
using cutlass::gemm::device::DefaultGemmConfiguration< arch::OpClassTensorOp, arch::Sm75, int4b_t, uint4b_t, ElementC, int32_t >::EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float> |
using cutlass::gemm::device::DefaultGemmConfiguration< arch::OpClassTensorOp, arch::Sm75, int4b_t, uint4b_t, ElementC, int32_t >::InstructionShape = GemmShape<8, 8, 32> |
using cutlass::gemm::device::DefaultGemmConfiguration< arch::OpClassTensorOp, arch::Sm75, int4b_t, uint4b_t, ElementC, int32_t >::Operator = arch::OpMultiplyAddSaturate |
using cutlass::gemm::device::DefaultGemmConfiguration< arch::OpClassTensorOp, arch::Sm75, int4b_t, uint4b_t, ElementC, int32_t >::ThreadblockShape = GemmShape<128, 256, 128> |
using cutlass::gemm::device::DefaultGemmConfiguration< arch::OpClassTensorOp, arch::Sm75, int4b_t, uint4b_t, ElementC, int32_t >::WarpShape = GemmShape<64, 64, 128> |
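static int const cutlass::gemm::device::DefaultGemmConfiguration< arch::OpClassTensorOp, arch::Sm75, int4b_t, uint4b_t, ElementC, int32_t >::kAlignmentA = 128 / sizeof_bits<int4b_t>::value |
static int const cutlass::gemm::device::DefaultGemmConfiguration< arch::OpClassTensorOp, arch::Sm75, int4b_t, uint4b_t, ElementC, int32_t >::kAlignmentB = 128 / sizeof_bits<uint4b_t>::value |
static int const cutlass::gemm::device::DefaultGemmConfiguration< arch::OpClassTensorOp, arch::Sm75, int4b_t, uint4b_t, ElementC, int32_t >::kStages = 2 |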