CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
|
Implements an efficient, software-pipelined batched reduction that computes D = alpha * Reduction(A) + beta * C.
#include <cuda.h>
#include "cutlass/coord.h"
#include "cutlass/util/platform.h"
#include "cutlass/fragment.h"
Go to the source code of this file.
Classes | |
struct | cutlass::reduction::BatchedReduction< BatchedReductionTraits_ > |
Namespaces | |
cutlass | |
cutlass::reduction | |
Functions | |
template<typename batched_reduction_ > | |
__global__ __launch_bounds__(batched_reduction_::Traits::kThreads, 1) void | cutlass::reduction::batched_reduction_kernel (typename batched_reduction_