CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
Classes | Public Types | Public Member Functions | Static Public Member Functions | Static Public Attributes | List of all members
cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage > Class Template Reference

#include <reduce_split_k.h>

Classes

struct  Params
 Params structure. More...
 
struct  SharedStorage
 

Public Types

using Shape = Shape_
 
using ReductionOp = ReductionOp_
 
using OutputOp = OutputOp_
 
using ElementWorkspace = typename ReductionOp::Element
 
using ElementAccumulator = typename ReductionOp::ElementAccumulator
 
using ElementOutput = typename OutputOp::ElementOutput
 
using WorkspaceTensorRef = TensorRef< ElementWorkspace, layout::RowMajor >
 
using OutputTensorRef = TensorRef< ElementOutput, layout::RowMajor >
 
using FragmentWorkspace = AlignedArray< ElementWorkspace, kElementsPerAccess >
 
using FragmentAccumulator = Array< ElementAccumulator, kElementsPerAccess >
 
using FragmentOutput = AlignedArray< ElementOutput, kElementsPerAccess >
 

Public Member Functions

CUTLASS_DEVICE void operator() (Params const &params, SharedStorage &storage)
 Perform a reduction. More...
 

Static Public Member Functions

static CUTLASS_HOST_DEVICE dim3 grid_shape (cutlass::MatrixCoord problem_size)
 Computes the grid size given a chosen threadblock shape. More...
 
static CUTLASS_HOST_DEVICE dim3 block_shape ()
 Determines the threadblock shape. More...
 

Static Public Attributes

static int const kElementsPerAccess = OutputOp::kCount
 
static int const kPartitionsPerStage = PartitionsPerStage
 

Member Typedef Documentation

template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::ElementAccumulator = typename ReductionOp::ElementAccumulator
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::ElementOutput = typename OutputOp::ElementOutput
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::ElementWorkspace = typename ReductionOp::Element
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::FragmentOutput = AlignedArray<ElementOutput, kElementsPerAccess>
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::FragmentWorkspace = AlignedArray<ElementWorkspace, kElementsPerAccess>
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::OutputOp = OutputOp_
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::OutputTensorRef = TensorRef<ElementOutput, layout::RowMajor>
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::ReductionOp = ReductionOp_
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::Shape = Shape_
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::WorkspaceTensorRef = TensorRef<ElementWorkspace, layout::RowMajor>

Member Function Documentation

template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
static CUTLASS_HOST_DEVICE dim3 cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::block_shape ( )
inline static
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
static CUTLASS_HOST_DEVICE dim3 cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::grid_shape ( cutlass::MatrixCoord  problem_size)
inline static
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
CUTLASS_DEVICE void cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::operator() ( Params const &  params, SharedStorage &  storage )
inline

Member Data Documentation

template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
int const cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::kElementsPerAccess = OutputOp::kCount
static
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
int const cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::kPartitionsPerStage = PartitionsPerStage
static

The documentation for this class was generated from the following file: