cub::DispatchSegmentedReduce#

template<typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename OffsetT, typename ReductionOpT, typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>, typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::it_value_t<InputIteratorT>, InitT>, typename PolicyHub = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>, typename KernelSource = detail::reduce::DeviceSegmentedReduceKernelSource<typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, ReductionOpT, InitT, AccumT>, typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> struct DispatchSegmentedReduce#

Utility class for dispatching the appropriately-tuned kernels for device-wide segmented reduction.

Template Parameters:

InputIteratorT – Random-access input iterator type for reading input items (may be a simple pointer type)
OutputIteratorT – Output iterator type for recording the reduced aggregate (may be a simple pointer type)
BeginOffsetIteratorT – Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)
EndOffsetIteratorT – Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)
OffsetT – Signed integer type for global offsets
ReductionOpT – Binary reduction functor type having member auto operator()(const T &a, const U &b)
InitT – Type of the initial value of the reduction

Public Functions

inline DispatchSegmentedReduce( void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream, int ptx_version, KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {} )#: Constructor.

template<typename ActivePolicyT, typename DeviceSegmentedReduceKernelT> inline cudaError_t InvokePasses( DeviceSegmentedReduceKernelT segmented_reduce_kernel, ActivePolicyT policy = {} )#

Invocation.

Template Parameters:

ActivePolicyT – Umbrella policy active for the target device
DeviceSegmentedReduceKernelT – Function type of cub::DeviceSegmentedReduceKernel

Parameters:

segmented_reduce_kernel – [in] Kernel function pointer to instantiation of cub::DeviceSegmentedReduceKernel

template<typename ActivePolicyT> inline cudaError_t Invoke( ActivePolicyT policy = {} )#: Invocation.

Public Members

void *d_temp_storage#

Device-accessible allocation of temporary storage.

When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

size_t &temp_storage_bytes#: Reference to size in bytes of d_temp_storage allocation.

InputIteratorT d_in#: Pointer to the input sequence of data items.

OutputIteratorT d_out#: Pointer to the output aggregate.

::cuda::std::int64_t num_segments#: The number of segments that comprise the segmented reduction data.

BeginOffsetIteratorT d_begin_offsets#: Random-access input iterator to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i] is the offset of the first element of the i^th data segment in d_in

EndOffsetIteratorT d_end_offsets#

Random-access input iterator to the sequence of ending offsets of length num_segments, such that d_end_offsets[i] - 1 is the offset of the last element of the i^th data segment in d_in.

If d_end_offsets[i] - 1 <= d_begin_offsets[i], the i^th segment is considered empty.

ReductionOpT reduction_op#: Binary reduction functor.

InitT init#: The initial value of the reduction.

cudaStream_t stream#: CUDA stream to launch kernels within. Default is stream₀.

int ptx_version#

KernelSource kernel_source#

KernelLauncherFactory launcher_factory#

Public Static Functions

template<typename MaxPolicyT = typename PolicyHub::MaxPolicy> static inline cudaError_t Dispatch( void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream, KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {} )#

Internal dispatch routine for computing a device-wide reduction.

Parameters:

d_temp_storage – [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
temp_storage_bytes – [inout] Reference to size in bytes of d_temp_storage allocation
d_in – [in] Pointer to the input sequence of data items
d_out – [out] Pointer to the output aggregate
num_segments – [in] The number of segments that comprise the segmented reduction data
d_begin_offsets – [in] Random-access input iterator to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i] is the offset of the first element of the i^th data segment in d_in
d_end_offsets – [in] Random-access input iterator to the sequence of ending offsets of length num_segments, such that d_end_offsets[i] - 1 is the offset of the last element of the i^th data segment in d_in. If d_end_offsets[i] - 1 <= d_begin_offsets[i], the i^th segment is considered empty.
reduction_op – [in] Binary reduction functor
init – [in] The initial value of the reduction
stream – [in] [optional] CUDA stream to launch kernels within. Default is stream₀.