CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers

Implements a software-pipelined, efficient batched reduction: D = alpha * Reduction(A) + beta * C. More...
#include <cuda.h>
#include "cutlass/coord.h"
#include "cutlass/util/platform.h"
#include "cutlass/fragment.h"
Classes  
struct  cutlass::reduction::BatchedReduction< BatchedReductionTraits_ > 
Namespaces  
cutlass  
cutlass::reduction  
Functions  
template<typename batched_reduction_ >  
__global__  cutlass::reduction::__launch_bounds__ (batched_reduction_::Traits::kThreads, 1) void batched_reduction_kernel(typename batched_reduction_ 