CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
Namespaces | Classes | Typedefs | Enumerations | Functions
cutlass Namespace Reference

Namespaces

 arch
 
 debug
 
 detail
 
 device_memory
 
 epilogue
 
 gemm
 
 layout
 
 library
 
 platform
 
 reduction
 
 reference
 
 thread
 
 transform
 

Classes

class  AlignedArray
 Aligned array type. More...
 
struct  AlignedBuffer
 Modifies semantics of cutlass::Array<> to provide guaranteed alignment. More...
 
class  Array< T, N, false >
 Statically sized array for any data type. More...
 
class  Array< T, N, true >
 Statically sized array for any data type. More...
 
struct  CommandLine
 
class  complex
 
class  ConstSubbyteReference
 
struct  Coord
 Statically-sized array specifying Coords within a tensor. More...
 
class  cuda_exception
 C++ exception wrapper for CUDA cudaError_t. More...
 
struct  Distribution
 Distribution type. More...
 
struct  divide_assert
 
struct  divides
 
struct  divides< Array< half_t, N > >
 
struct  divides< Array< T, N > >
 
struct  FloatType
 Defines a floating-point type based on the number of exponent and mantissa bits. More...
 
struct  FloatType< 11, 52 >
 
struct  FloatType< 5, 10 >
 
struct  FloatType< 8, 23 >
 
struct  half_t
 IEEE half-precision floating-point type. More...
 
class  HostTensor
 Host tensor. More...
 
class  IdentityTensorLayout
 
struct  integer_subbyte
 Sub-byte integer type, parameterized by bit width and signedness. More...
 
struct  IntegerType
 Defines integers based on size and whether they are signed. More...
 
struct  IntegerType< 1, false >
 
struct  IntegerType< 1, true >
 
struct  IntegerType< 16, false >
 
struct  IntegerType< 16, true >
 
struct  IntegerType< 32, false >
 
struct  IntegerType< 32, true >
 
struct  IntegerType< 4, false >
 
struct  IntegerType< 4, true >
 
struct  IntegerType< 64, false >
 
struct  IntegerType< 64, true >
 
struct  IntegerType< 8, false >
 
struct  IntegerType< 8, true >
 
struct  is_pow2
 
struct  KernelLaunchConfiguration
 Structure containing the basic launch configuration of a CUDA kernel. More...
 
struct  log2_down
 
struct  log2_down< N, 1, Count >
 
struct  log2_up
 
struct  log2_up< N, 1, Count >
 
struct  MatrixCoord
 
struct  MatrixShape
 Describes the size of a matrix tile. More...
 
struct  Max
 
struct  maximum
 
struct  maximum< Array< T, N > >
 
struct  maximum< float >
 
struct  Min
 
struct  minimum
 
struct  minimum< Array< T, N > >
 
struct  minimum< float >
 
struct  minus
 
struct  minus< Array< half_t, N > >
 
struct  minus< Array< T, N > >
 
struct  multiplies
 
struct  multiplies< Array< half_t, N > >
 
struct  multiplies< Array< T, N > >
 
struct  multiply_add
 Fused multiply-add. More...
 
struct  multiply_add< Array< half_t, N >, Array< half_t, N >, Array< half_t, N > >
 Fused multiply-add. More...
 
struct  multiply_add< Array< T, N >, Array< T, N >, Array< T, N > >
 Fused multiply-add. More...
 
struct  multiply_add< complex< T >, complex< T >, complex< T > >
 Fused multiply-add. More...
 
struct  multiply_add< complex< T >, T, complex< T > >
 Fused multiply-add. More...
 
struct  multiply_add< T, complex< T >, complex< T > >
 Fused multiply-add. More...
 
struct  negate
 
struct  negate< Array< half_t, N > >
 
struct  negate< Array< T, N > >
 
struct  NumericArrayConverter
 Conversion operator for Array. More...
 
struct  NumericArrayConverter< float, half_t, 2, Round >
 Partial specialization for Array<float, 2> <= Array<half_t, 2>, round to nearest. More...
 
struct  NumericArrayConverter< float, half_t, N, Round >
 Partial specialization for Array<float> <= Array<half> More...
 
struct  NumericArrayConverter< half_t, float, 2, FloatRoundStyle::round_to_nearest >
 Partial specialization for Array<half, 2> <= Array<float, 2>, round to nearest. More...
 
struct  NumericArrayConverter< half_t, float, N, Round >
 Partial specialization for Array<half> <= Array<float> More...
 
struct  NumericConverter
 
struct  NumericConverter< float, half_t, Round >
 Partial specialization for float <= half_t. More...
 
struct  NumericConverter< half_t, float, FloatRoundStyle::round_to_nearest >
 Specialization for round-to-nearest. More...
 
struct  NumericConverter< half_t, float, FloatRoundStyle::round_toward_zero >
 Specialization for round-toward-zero. More...
 
struct  NumericConverter< int8_t, float, Round >
 
struct  NumericConverter< T, T, Round >
 Partial specialization for conversion between identical types (T <= T). More...
 
struct  NumericConverterClamp
 
struct  plus
 
struct  plus< Array< half_t, N > >
 
struct  plus< Array< T, N > >
 
struct  PredicateVector
 Statically sized array of bits implementing a predicate vector. More...
 
struct  RealType
 Used to determine the real-valued underlying type of a numeric type T. More...
 
struct  RealType< complex< T > >
 Partial specialization for complex-valued type. More...
 
struct  ReferenceFactory
 
struct  ReferenceFactory< Element, false >
 
struct  ReferenceFactory< Element, true >
 
struct  ScalarIO
 Helper to enable formatted printing of CUTLASS scalar types to an ostream. More...
 
class  Semaphore
 CTA-wide semaphore for inter-CTA synchronization. More...
 
struct  sizeof_bits
 Defines the size of an element in bits. More...
 
struct  sizeof_bits< Array< T, N, RegisterSized > >
 Defines the size of a cutlass::Array<> in bits. More...
 
struct  sizeof_bits< bin1_t >
 Defines the size of an element in bits - specialized for bin1_t. More...
 
struct  sizeof_bits< int4b_t >
 Defines the size of an element in bits - specialized for int4b_t. More...
 
struct  sizeof_bits< uint1b_t >
 Defines the size of an element in bits - specialized for uint1b_t. More...
 
struct  sizeof_bits< uint4b_t >
 Defines the size of an element in bits - specialized for uint4b_t. More...
 
struct  sqrt_est
 
class  SubbyteReference
 
struct  Tensor4DCoord
 Defines a canonical 4D coordinate used by tensor operations. More...
 
class  TensorRef
 
class  TensorView
 
struct  TypeTraits
 
struct  TypeTraits< complex< double > >
 
struct  TypeTraits< complex< float > >
 
struct  TypeTraits< complex< half > >
 
struct  TypeTraits< complex< half_t > >
 
struct  TypeTraits< double >
 
struct  TypeTraits< float >
 
struct  TypeTraits< half_t >
 
struct  TypeTraits< int >
 
struct  TypeTraits< int64_t >
 
struct  TypeTraits< int8_t >
 
struct  TypeTraits< uint64_t >
 
struct  TypeTraits< uint8_t >
 
struct  TypeTraits< unsigned >
 
struct  xor_add
 Fused XOR-add. More...
 

Typedefs

using uint1b_t = integer_subbyte< 1, false >
 1-bit Unsigned integer type More...
 
using int4b_t = integer_subbyte< 4, true >
 4-bit Integer type More...
 
using uint4b_t = integer_subbyte< 4, false >
 4-bit Unsigned integer type More...
 
using bin1_t = bool
 1-bit binary type More...
 

Enumerations

enum  ComplexTransform { ComplexTransform::kNone, ComplexTransform::kConjugate }
 Enumerated type describing a transformation on a complex value. More...
 
enum  Status {
  Status::kSuccess, Status::kErrorMisalignedOperand, Status::kErrorInvalidLayout, Status::kErrorInvalidProblem,
  Status::kErrorNotSupported, Status::kErrorWorkspaceNull, Status::kErrorInternal, Status::kInvalid
}
 Status code returned by CUTLASS operations. More...
 
enum  MatrixLayout { MatrixLayout::kColumnMajor, MatrixLayout::kRowMajor }
 
enum  MatrixTransform { MatrixTransform::kNone, MatrixTransform::kTranspose, MatrixTransform::kConjugate, MatrixTransform::kHermitian }
 Transformation applied to matrix operands. More...
 
enum  FloatRoundStyle {
  FloatRoundStyle::round_indeterminate, FloatRoundStyle::round_toward_zero, FloatRoundStyle::round_to_nearest, FloatRoundStyle::round_toward_infinity,
  FloatRoundStyle::round_toward_neg_infinity, FloatRoundStyle::round_half_ulp_truncate
}
 

Functions

CUTLASS_HOST_DEVICE constexpr bool ispow2 (unsigned x)
 Returns true if the argument is a power of 2. More...
 
CUTLASS_HOST_DEVICE constexpr unsigned floor_pow_2 (unsigned x)
 Returns the largest power of two not greater than the argument. More...
 
CUTLASS_HOST_DEVICE float const & real (cuFloatComplex const &z)
 Returns the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE float & real (cuFloatComplex &z)
 Returns the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE double const & real (cuDoubleComplex const &z)
 Returns the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE double & real (cuDoubleComplex &z)
 Returns the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE float const & imag (cuFloatComplex const &z)
 Returns the imaginary part of the complex number. More...
 
CUTLASS_HOST_DEVICE float & imag (cuFloatComplex &z)
 Returns the imaginary part of the complex number. More...
 
CUTLASS_HOST_DEVICE double const & imag (cuDoubleComplex const &z)
 Returns the imaginary part of the complex number. More...
 
CUTLASS_HOST_DEVICE double & imag (cuDoubleComplex &z)
 Returns the imaginary part of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T const & real (complex< T > const &z)
 Returns the real part of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T & real (complex< T > &z)
 Returns the real part of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T const & imag (complex< T > const &z)
 Returns the imaginary part of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T & imag (complex< T > &z)
 Returns the imaginary part of the complex number. More...
 
template<typename T >
std::ostream & operator<< (std::ostream &out, complex< T > const &z)
 
template<typename T >
CUTLASS_HOST_DEVICE T abs (complex< T > const &z)
 Returns the magnitude of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T arg (complex< T > const &z)
 Returns the phase (argument) of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T norm (T const &z)
 Returns the squared magnitude of a real number. More...
 
template<>
CUTLASS_HOST_DEVICE int8_t norm (int8_t const &z)
 Returns the squared magnitude of a real number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE double norm (complex< T > const &z)
 Returns the squared magnitude of a complex number. More...
 
template<typename T , typename R >
CUTLASS_HOST_DEVICE R norm_accumulate (T const &x, R const &accumulator)
 Norm-accumulate calculation. More...
 
template<typename T , typename R >
CUTLASS_HOST_DEVICE R norm_accumulate (complex< T > const &z, R const &accumulator)
 Norm accumulate specialized for complex types. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > conj (complex< T > const &z)
 Returns the complex conjugate. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > proj (complex< T > const &z)
 Projects the complex number z onto the Riemann sphere. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > polar (T const &r, T const &theta=T())
 Returns a complex number with magnitude r and phase theta. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > exp (complex< T > const &z)
 Computes the complex exponential of z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > log (complex< T > const &z)
 Computes the complex natural logarithm of z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > log10 (complex< T > const &z)
 Computes the complex base-10 logarithm of z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > sqrt (complex< T > const &z)
 Computes the square root of complex number z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cos (complex< T > const &z)
 Computes the cosine of complex z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > sin (complex< T > const &z)
 Computes the sin of complex z. More...
 
template<>
CUTLASS_HOST_DEVICE cutlass::complex< half_t > from_real< cutlass::complex< half_t > > (double r)
 
template<>
CUTLASS_HOST_DEVICE cutlass::complex< float > from_real< cutlass::complex< float > > (double r)
 
template<>
CUTLASS_HOST_DEVICE cutlass::complex< double > from_real< cutlass::complex< double > > (double r)
 
template<int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord< Rank, Index > operator/ (Index s, Coord< Rank, Index > coord)
 Scalar division. More...
 
template<int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord< Rank, Index > operator/ (Coord< Rank, Index > coord, Index s)
 Scalar division. More...
 
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord (int _0)
 Helper to make a 1-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 2 > make_Coord (int _0, int _1)
 Helper to make a 2-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 3 > make_Coord (int _0, int _1, int _2)
 Helper to make a 3-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 4 > make_Coord (int _0, int _1, int _2, int _3)
 Helper to make a 4-element coordinate. More...
 
template<int Rank>
std::ostream & operator<< (std::ostream &out, Coord< Rank > const &coord)
 
std::istream & operator>> (std::istream &stream, half_t &x)
 
std::ostream & operator<< (std::ostream &out, half_t const &x)
 
template<typename T >
std::ostream & operator<< (std::ostream &out, ScalarIO< T > const &scalar)
 Default printing to ostream. More...
 
template<>
std::ostream & operator<< (std::ostream &out, ScalarIO< int8_t > const &scalar)
 Printing to ostream of int8_t as integer rather than character. More...
 
template<>
std::ostream & operator<< (std::ostream &out, ScalarIO< uint8_t > const &scalar)
 Printing to ostream of uint8_t as integer rather than character. More...
 
template<typename Operator >
__global__ void Kernel (typename Operator::Params params)
 Generic CUTLASS kernel template. More...
 
template<typename dividend_t , typename divisor_t >
CUTLASS_HOST_DEVICE dividend_t round_nearest (dividend_t dividend, divisor_t divisor)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t gcd (value_t a, value_t b)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t lcm (value_t a, value_t b)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t clz (value_t x)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t find_log2 (value_t x)
 
CUTLASS_HOST_DEVICE void find_divisor (unsigned int &mul, unsigned int &shr, unsigned int denom)
 
CUTLASS_HOST_DEVICE void fast_divmod (int &quo, int &rem, int src, int div, unsigned int mul, unsigned int shr)
 
CUTLASS_HOST_DEVICE void fast_divmod (int &quo, int64_t &rem, int64_t src, int div, unsigned int mul, unsigned int shr)
 
CUTLASS_HOST_DEVICE constexpr int const_min (int a, int b)
 
CUTLASS_HOST_DEVICE constexpr int const_max (int a, int b)
 
CUTLASS_HOST_DEVICE bool signbit (cutlass::half_t const &h)
 
CUTLASS_HOST_DEVICE cutlass::half_t abs (cutlass::half_t const &h)
 
CUTLASS_HOST_DEVICE bool isnan (cutlass::half_t const &h)
 
CUTLASS_HOST_DEVICE bool isfinite (cutlass::half_t const &h)
 
CUTLASS_HOST_DEVICE cutlass::half_t nanh (const char *)
 
CUTLASS_HOST_DEVICE bool isinf (cutlass::half_t const &h)
 
CUTLASS_HOST_DEVICE bool isnormal (cutlass::half_t const &h)
 
CUTLASS_HOST_DEVICE int fpclassify (cutlass::half_t const &h)
 
CUTLASS_HOST_DEVICE cutlass::half_t sqrt (cutlass::half_t const &h)
 
CUTLASS_HOST_DEVICE half_t copysign (half_t const &a, half_t const &b)
 
CUTLASS_HOST_DEVICE bool operator== (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE bool operator!= (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE bool operator< (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE bool operator<= (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE bool operator> (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE bool operator>= (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE half_t operator+ (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE half_t operator- (half_t const &lhs)
 
CUTLASS_HOST_DEVICE half_t operator- (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE half_t operator* (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE half_t operator/ (half_t const &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE half_t & operator+= (half_t &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE half_t & operator-= (half_t &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE half_t & operator*= (half_t &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE half_t & operator/= (half_t &lhs, half_t const &rhs)
 
CUTLASS_HOST_DEVICE half_t & operator++ (half_t &lhs)
 
CUTLASS_HOST_DEVICE half_t & operator-- (half_t &lhs)
 
CUTLASS_HOST_DEVICE half_t operator++ (half_t &lhs, int)
 
CUTLASS_HOST_DEVICE half_t operator-- (half_t &lhs, int)
 
template<typename T >
CUTLASS_HOST_DEVICE bool relatively_equal (T a, T b, T epsilon, T nonzero_floor)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< uint1b_t > (uint1b_t a, uint1b_t b, uint1b_t, uint1b_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< int4b_t > (int4b_t a, int4b_t b, int4b_t, int4b_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< uint4b_t > (uint4b_t a, uint4b_t b, uint4b_t, uint4b_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< int8_t > (int8_t a, int8_t b, int8_t, int8_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< uint8_t > (uint8_t a, uint8_t b, uint8_t, uint8_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< int16_t > (int16_t a, int16_t b, int16_t, int16_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< uint16_t > (uint16_t a, uint16_t b, uint16_t, uint16_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< int32_t > (int32_t a, int32_t b, int32_t, int32_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< uint32_t > (uint32_t a, uint32_t b, uint32_t, uint32_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< int64_t > (int64_t a, int64_t b, int64_t, int64_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< uint64_t > (uint64_t a, uint64_t b, uint64_t, uint64_t)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< half_t > (half_t a, half_t b, half_t epsilon, half_t nonzero_floor)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< float > (float a, float b, float epsilon, float nonzero_floor)
 
template<>
CUTLASS_HOST_DEVICE bool relatively_equal< double > (double a, double b, double epsilon, double nonzero_floor)
 
template<typename Element , typename Layout >
CUTLASS_HOST_DEVICE TensorRef< Element, Layout > make_TensorRef (Element *ptr, Layout const &layout)
 Constructs a TensorRef, deducing types from arguments. More...
 
template<typename Element , typename Layout >
bool TensorRef_aligned (TensorRef< Element, Layout > const &ref, int alignment)
 
template<typename Element , typename Layout >
CUTLASS_HOST_DEVICE TensorView< Element, Layout > make_TensorView (Element *ptr, Layout const &layout, typename Layout::TensorCoord const &extent)
 Constructs a TensorView, deducing types from arguments. More...
 
__host__ CUTLASS_DEVICE cudaError_t cuda_perror_impl (cudaError_t error, const char *filename, int line)
 The corresponding error message is printed to stderr (or stdout in device code) along with the supplied source context. More...
 
std::ostream & operator<< (std::ostream &out, cudaError_t result)
 Writes a cudaError_t to an output stream. More...
 
std::ostream & operator<< (std::ostream &out, cuda_exception const &e)
 Writes a cuda_exception instance to an output stream. More...
 
template<int Interleaved, typename Element , typename Layout >
void reorder_column (TensorRef< Element, Layout > dest, TensorRef< Element, Layout > src, cutlass::gemm::GemmCoord problem_size)
 
template<typename Element , typename Layout >
std::ostream & TensorViewWrite (std::ostream &out, TensorView< Element, Layout > const &view)
 Prints human-readable representation of a TensorView to an ostream. More...
 
template<typename Element , typename Layout >
std::ostream & operator<< (std::ostream &out, TensorView< Element, Layout > const &view)
 Prints human-readable representation of a TensorView to an ostream. More...
 

Typedef Documentation

using cutlass::bin1_t = typedef bool
using cutlass::int4b_t = typedef integer_subbyte<4, true>
using cutlass::uint1b_t = typedef integer_subbyte<1, false>
using cutlass::uint4b_t = typedef integer_subbyte<4, false>

Enumeration Type Documentation

enum cutlass::ComplexTransform
strong
Enumerator
kNone 
kConjugate 

enum cutlass::FloatRoundStyle
strong

Floating-point rounding style similar to the Standard Library's formats but supporting additional rounding options.

Enumerator
round_indeterminate 

rounding mode unknown

round_toward_zero 

round toward zero

round_to_nearest 

round to nearest even

round_toward_infinity 

round toward infinity

round_toward_neg_infinity 

round toward negative infinity

round_half_ulp_truncate 

add 0.5ulp to integer representation then round toward zero

enum cutlass::MatrixLayout
strong
Enumerator
kColumnMajor 
kRowMajor 
enum cutlass::MatrixTransform
strong
Enumerator
kNone 

no operation

kTranspose 

transpose operation

kConjugate 

conjugate

kHermitian 

conjugate transpose

enum cutlass::Status
strong
Enumerator
kSuccess 

Operation was successful.

kErrorMisalignedOperand 

operands fail alignment requirements.

kErrorInvalidLayout 

Layout fails alignment requirement.

kErrorInvalidProblem 

Specified problem size is not supported by operator.

kErrorNotSupported 

Operation is not supported on current device.

kErrorWorkspaceNull 

The given workspace is null when it is required to be non-null.

kErrorInternal 

An error within CUTLASS occurred.

kInvalid 

Status is unspecified.

Function Documentation

template<typename T >
CUTLASS_HOST_DEVICE T cutlass::abs ( complex< T > const &  z)
template<typename T >
CUTLASS_HOST_DEVICE T cutlass::arg ( complex< T > const &  z)
template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::clz ( value_t  x)

log2 computation. Open question: how do the routines below differ from the log2_up / log2_down templates?

template<typename T >
CUTLASS_HOST_DEVICE complex<T> cutlass::conj ( complex< T > const &  z)
CUTLASS_HOST_DEVICE constexpr int cutlass::const_max ( int  a,
int  b 
)
CUTLASS_HOST_DEVICE constexpr int cutlass::const_min ( int  a,
int  b 
)
CUTLASS_HOST_DEVICE half_t cutlass::copysign ( half_t const &  a,
half_t const &  b 
)
template<typename T >
CUTLASS_HOST_DEVICE complex<T> cutlass::cos ( complex< T > const &  z)
__host__ CUTLASS_DEVICE cudaError_t cutlass::cuda_perror_impl ( cudaError_t  error,
const char *  filename,
int  line 
)
Returns
The CUDA error.
template<typename T >
CUTLASS_HOST_DEVICE complex<T> cutlass::exp ( complex< T > const &  z)
CUTLASS_HOST_DEVICE void cutlass::fast_divmod ( int &  quo,
int &  rem,
int  src,
int  div,
unsigned int  mul,
unsigned int  shr 
)

Find quotient and remainder using device-side intrinsics

CUTLASS_HOST_DEVICE void cutlass::fast_divmod ( int &  quo,
int64_t &  rem,
int64_t  src,
int  div,
unsigned int  mul,
unsigned int  shr 
)
CUTLASS_HOST_DEVICE void cutlass::find_divisor ( unsigned int &  mul,
unsigned int &  shr,
unsigned int  denom 
)

Find divisor, using find_log2

template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::find_log2 ( value_t  x)
CUTLASS_HOST_DEVICE constexpr unsigned cutlass::floor_pow_2 ( unsigned  x)
CUTLASS_HOST_DEVICE int cutlass::fpclassify ( cutlass::half_t const &  h)
template<>
CUTLASS_HOST_DEVICE cutlass::complex<double> cutlass::from_real< cutlass::complex< double > > ( double  r)
template<>
CUTLASS_HOST_DEVICE cutlass::complex<float> cutlass::from_real< cutlass::complex< float > > ( double  r)
template<>
CUTLASS_HOST_DEVICE cutlass::complex<half_t> cutlass::from_real< cutlass::complex< half_t > > ( double  r)
template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::gcd ( value_t  a,
value_t  b 
)

Greatest common divisor

CUTLASS_HOST_DEVICE float const& cutlass::imag ( cuFloatComplex const &  z)
CUTLASS_HOST_DEVICE float& cutlass::imag ( cuFloatComplex &  z)
CUTLASS_HOST_DEVICE double const& cutlass::imag ( cuDoubleComplex const &  z)
CUTLASS_HOST_DEVICE double& cutlass::imag ( cuDoubleComplex &  z)
template<typename T >
CUTLASS_HOST_DEVICE T const& cutlass::imag ( complex< T > const &  z)
template<typename T >
CUTLASS_HOST_DEVICE T& cutlass::imag ( complex< T > &  z)
CUTLASS_HOST_DEVICE bool cutlass::isfinite ( cutlass::half_t const &  h)
CUTLASS_HOST_DEVICE bool cutlass::isinf ( cutlass::half_t const &  h)
CUTLASS_HOST_DEVICE bool cutlass::isnan ( cutlass::half_t const &  h)
CUTLASS_HOST_DEVICE bool cutlass::isnormal ( cutlass::half_t const &  h)
CUTLASS_HOST_DEVICE constexpr bool cutlass::ispow2 ( unsigned  x)
template<typename Operator >
__global__ void cutlass::Kernel ( typename Operator::Params  params)
template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::lcm ( value_t  a,
value_t  b 
)

Least common multiple

template<typename T >
CUTLASS_HOST_DEVICE complex<T> cutlass::log ( complex< T > const &  z)
template<typename T >
CUTLASS_HOST_DEVICE complex<T> cutlass::log10 ( complex< T > const &  z)
CUTLASS_HOST_DEVICE Coord<1> cutlass::make_Coord ( int  _0)
CUTLASS_HOST_DEVICE Coord<2> cutlass::make_Coord ( int  _0,
int  _1 
)
CUTLASS_HOST_DEVICE Coord<3> cutlass::make_Coord ( int  _0,
int  _1,
int  _2 
)
CUTLASS_HOST_DEVICE Coord<4> cutlass::make_Coord ( int  _0,
int  _1,
int  _2,
int  _3 
)
template<typename Element , typename Layout >
CUTLASS_HOST_DEVICE TensorRef<Element, Layout> cutlass::make_TensorRef ( Element *  ptr,
Layout const &  layout 
)
template<typename Element , typename Layout >
CUTLASS_HOST_DEVICE TensorView<Element, Layout> cutlass::make_TensorView ( Element *  ptr,
Layout const &  layout,
typename Layout::TensorCoord const &  extent 
)
CUTLASS_HOST_DEVICE cutlass::half_t cutlass::nanh ( const char *  )
template<typename T >
CUTLASS_HOST_DEVICE T cutlass::norm ( T const &  z)
template<>
CUTLASS_HOST_DEVICE int8_t cutlass::norm ( int8_t const &  z)
template<typename T >
CUTLASS_HOST_DEVICE double cutlass::norm ( complex< T > const &  z)
template<typename T , typename R >
CUTLASS_HOST_DEVICE R cutlass::norm_accumulate ( T const &  x,
R const &  accumulator 
)
template<typename T , typename R >
CUTLASS_HOST_DEVICE R cutlass::norm_accumulate ( complex< T > const &  z,
R const &  accumulator 
)
CUTLASS_HOST_DEVICE bool cutlass::operator!= ( half_t const &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE half_t cutlass::operator* ( half_t const &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE half_t& cutlass::operator*= ( half_t &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE half_t cutlass::operator+ ( half_t const &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE half_t& cutlass::operator++ ( half_t &  lhs)
CUTLASS_HOST_DEVICE half_t cutlass::operator++ ( half_t &  lhs,
int   
)
CUTLASS_HOST_DEVICE half_t& cutlass::operator+= ( half_t &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE half_t cutlass::operator- ( half_t const &  lhs)
CUTLASS_HOST_DEVICE half_t cutlass::operator- ( half_t const &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE half_t& cutlass::operator-- ( half_t &  lhs)
CUTLASS_HOST_DEVICE half_t cutlass::operator-- ( half_t &  lhs,
int   
)
CUTLASS_HOST_DEVICE half_t& cutlass::operator-= ( half_t &  lhs,
half_t const &  rhs 
)
template<int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord<Rank, Index> cutlass::operator/ ( Index  s,
Coord< Rank, Index >  coord 
)
template<int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord<Rank, Index> cutlass::operator/ ( Coord< Rank, Index >  coord,
Index  s 
)
CUTLASS_HOST_DEVICE half_t cutlass::operator/ ( half_t const &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE half_t& cutlass::operator/= ( half_t &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE bool cutlass::operator< ( half_t const &  lhs,
half_t const &  rhs 
)
template<int Rank>
std::ostream& cutlass::operator<< ( std::ostream &  out,
Coord< Rank > const &  coord 
)
inline
std::ostream& cutlass::operator<< ( std::ostream &  out,
cudaError_t  result 
)
inline
std::ostream& cutlass::operator<< ( std::ostream &  out,
cuda_exception const &  e 
)
inline
std::ostream& cutlass::operator<< ( std::ostream &  out,
half_t const &  x 
)
inline
template<typename T >
std::ostream& cutlass::operator<< ( std::ostream &  out,
ScalarIO< T > const &  scalar 
)
inline
template<>
std::ostream& cutlass::operator<< ( std::ostream &  out,
ScalarIO< int8_t > const &  scalar 
)
inline
template<>
std::ostream& cutlass::operator<< ( std::ostream &  out,
ScalarIO< uint8_t > const &  scalar 
)
inline
template<typename Element , typename Layout >
std::ostream& cutlass::operator<< ( std::ostream &  out,
TensorView< Element, Layout > const &  view 
)
inline
template<typename T >
std::ostream& cutlass::operator<< ( std::ostream &  out,
complex< T > const &  z 
)
CUTLASS_HOST_DEVICE bool cutlass::operator<= ( half_t const &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE bool cutlass::operator== ( half_t const &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE bool cutlass::operator> ( half_t const &  lhs,
half_t const &  rhs 
)
CUTLASS_HOST_DEVICE bool cutlass::operator>= ( half_t const &  lhs,
half_t const &  rhs 
)
std::istream& cutlass::operator>> ( std::istream &  stream,
half_t &  x 
)
inline
template<typename T >
CUTLASS_HOST_DEVICE complex<T> cutlass::polar ( T const &  r,
T const &  theta = T() 
)
template<typename T >
CUTLASS_HOST_DEVICE complex<T> cutlass::proj ( complex< T > const &  z)
CUTLASS_HOST_DEVICE float const& cutlass::real ( cuFloatComplex const &  z)
CUTLASS_HOST_DEVICE float& cutlass::real ( cuFloatComplex &  z)
CUTLASS_HOST_DEVICE double const& cutlass::real ( cuDoubleComplex const &  z)
CUTLASS_HOST_DEVICE double& cutlass::real ( cuDoubleComplex &  z)
template<typename T >
CUTLASS_HOST_DEVICE T const& cutlass::real ( complex< T > const &  z)
template<typename T >
CUTLASS_HOST_DEVICE T& cutlass::real ( complex< T > &  z)
template<typename T >
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal ( T  a,
T  b,
T  epsilon,
T  nonzero_floor 
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< double > ( double  a,
double  b,
double  epsilon,
double  nonzero_floor 
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< float > ( float  a,
float  b,
float  epsilon,
float  nonzero_floor 
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< half_t > ( half_t  a,
half_t  b,
half_t  epsilon,
half_t  nonzero_floor 
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< int16_t > ( int16_t  a,
int16_t  b,
int16_t  ,
int16_t   
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< int32_t > ( int32_t  a,
int32_t  b,
int32_t  ,
int32_t   
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< int64_t > ( int64_t  a,
int64_t  b,
int64_t  ,
int64_t   
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< int8_t > ( int8_t  a,
int8_t  b,
int8_t  ,
int8_t   
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< uint16_t > ( uint16_t  a,
uint16_t  b,
uint16_t  ,
uint16_t   
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< uint32_t > ( uint32_t  a,
uint32_t  b,
uint32_t  ,
uint32_t   
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< uint64_t > ( uint64_t  a,
uint64_t  b,
uint64_t  ,
uint64_t   
)
template<>
CUTLASS_HOST_DEVICE bool cutlass::relatively_equal< uint8_t > ( uint8_t  a,
uint8_t  b,
uint8_t  ,
uint8_t   
)
template<int Interleaved, typename Element , typename Layout >
void cutlass::reorder_column ( TensorRef< Element, Layout >  dest,
TensorRef< Element, Layout >  src,
cutlass::gemm::GemmCoord  problem_size 
)
template<typename dividend_t , typename divisor_t >
CUTLASS_HOST_DEVICE dividend_t cutlass::round_nearest ( dividend_t  dividend,
divisor_t  divisor 
)

Round dividend up to the nearest multiple of divisor

CUTLASS_HOST_DEVICE bool cutlass::signbit ( cutlass::half_t const &  h)
template<typename T >
CUTLASS_HOST_DEVICE complex<T> cutlass::sin ( complex< T > const &  z)
template<typename T >
CUTLASS_HOST_DEVICE complex<T> cutlass::sqrt ( complex< T > const &  z)
CUTLASS_HOST_DEVICE cutlass::half_t cutlass::sqrt ( cutlass::half_t const &  h)
template<typename Element , typename Layout >
bool cutlass::TensorRef_aligned ( TensorRef< Element, Layout > const &  ref,
int  alignment 
)
template<typename Element , typename Layout >
std::ostream& cutlass::TensorViewWrite ( std::ostream &  out,
TensorView< Element, Layout > const &  view 
)
inline