cub::DispatchReduce#

template<typename InputIteratorT, typename OutputIteratorT, typename OffsetT, typename ReductionOpT, typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>, typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::it_value_t<InputIteratorT>, InitT>, typename TransformOpT = ::cuda::std::identity, typename PolicyHub = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>, typename KernelSource = detail::reduce::DeviceReduceKernelSource<typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, TransformOpT>, typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
struct DispatchReduce#

Utility class for dispatching the appropriately-tuned kernels for device-wide reduction.

Template Parameters:
  • InputIteratorT – Random-access input iterator type for reading input items (may be a simple pointer type)

  • OutputIteratorT – Output iterator type for recording the reduced aggregate (may be a simple pointer type)

  • OffsetT – Signed integer type for global offsets

  • ReductionOpT – Binary reduction functor type having member auto operator()(const T &a, const U &b)

  • InitT – Initial value type

Public Functions

inline DispatchReduce(
void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
OffsetT num_items,
ReductionOpT reduction_op,
InitT init,
cudaStream_t stream,
int ptx_version,
TransformOpT transform_op = {},
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {},
)#

Constructor.

template<typename ActivePolicyT, typename SingleTileKernelT>
inline cudaError_t InvokeSingleTile(
SingleTileKernelT single_tile_kernel,
ActivePolicyT policy = {},
)#

Invoke a single thread block to reduce in-core.

Template Parameters:
  • ActivePolicyT – Umbrella policy active for the target device

  • SingleTileKernelT – Function type of cub::DeviceReduceSingleTileKernel

Parameters:

  • single_tile_kernel[in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel

template<typename ActivePolicyT, typename ReduceKernelT, typename SingleTileKernelT>
inline cudaError_t InvokePasses(
ReduceKernelT reduce_kernel,
SingleTileKernelT single_tile_kernel,
ActivePolicyT active_policy = {},
)#

Invoke two passes to reduce.

Template Parameters:
  • ActivePolicyT – Umbrella policy active for the target device

  • ReduceKernelT – Function type of cub::DeviceReduceKernel

  • SingleTileKernelT – Function type of cub::DeviceReduceSingleTileKernel

Parameters:
  • reduce_kernel[in] Kernel function pointer to parameterization of cub::DeviceReduceKernel

  • single_tile_kernel[in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel

template<typename ActivePolicyT>
inline cudaError_t Invoke(
ActivePolicyT active_policy = {},
)#

Invocation.

Public Members

void *d_temp_storage#

Device-accessible allocation of temporary storage.

When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

size_t &temp_storage_bytes#

Reference to size in bytes of d_temp_storage allocation.

InputIteratorT d_in#

Pointer to the input sequence of data items.

OutputIteratorT d_out#

Pointer to the output aggregate.

OffsetT num_items#

Total number of input items (i.e., length of d_in)

ReductionOpT reduction_op#

Binary reduction functor.

InitT init#

The initial value of the reduction.

cudaStream_t stream#

CUDA stream to launch kernels within. Default is stream 0.

int ptx_version#
TransformOpT transform_op#
KernelSource kernel_source#
KernelLauncherFactory launcher_factory#

Public Static Functions

template<typename MaxPolicyT = typename PolicyHub::MaxPolicy>
static inline cudaError_t Dispatch(
void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
OffsetT num_items,
ReductionOpT reduction_op,
InitT init,
cudaStream_t stream,
TransformOpT transform_op = {},
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {},
MaxPolicyT max_policy = {},
)#

Internal dispatch routine for computing a device-wide reduction.

Parameters:
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • d_in[in] Pointer to the input sequence of data items

  • d_out[out] Pointer to the output aggregate

  • num_items[in] Total number of input items (i.e., length of d_in)

  • reduction_op[in] Binary reduction functor

  • init[in] The initial value of the reduction

  • stream[in] [optional] CUDA stream to launch kernels within. Default is stream 0.