Thread-level#

In contrast to algorithms at the warp/block/device layer, single threaded functionality like cub::ThreadReduce is typically implemented as a sequential function and rarely exposed to the user.

template <
    int         LENGTH,
    typename    T,
    typename    ReductionOp,
    typename    PrefixT,
    typename    AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
__device__ __forceinline__ AccumT ThreadReduce(
    T           (&input)[LENGTH],
    ReductionOp reduction_op,
    PrefixT     prefix)
{
    return ...;
}