cub::DispatchSegmentedReduce#

template<typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename OffsetT, typename ReductionOpT, typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>, typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::it_value_t<InputIteratorT>, InitT>, typename PolicyHub = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>, typename KernelSource = detail::reduce::DeviceSegmentedReduceKernelSource<typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, ReductionOpT, InitT, AccumT>, typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
struct DispatchSegmentedReduce#

Utility class for dispatching the appropriately-tuned kernels for device-wide segmented reduction.

Template Parameters:
  • InputIteratorT – Random-access input iterator type for reading input items (may be a simple pointer type)

  • OutputIteratorT – Output iterator type for recording the reduced aggregate (may be a simple pointer type)

  • BeginOffsetIteratorT – Random-access input iterator type for reading segment beginning offsets (may be a simple pointer type)

  • EndOffsetIteratorT – Random-access input iterator type for reading segment ending offsets (may be a simple pointer type)

  • OffsetT – Signed integer type for global offsets

  • ReductionOpT – Binary reduction functor type having member auto operator()(const T &a, const U &b)

  • InitT – Type of the initial value of the reduction (the type of init)

Public Functions

inline DispatchSegmentedReduce(
void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
ReductionOpT reduction_op,
InitT init,
cudaStream_t stream,
int ptx_version,
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {},
)#

Constructor.

template<typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
inline cudaError_t InvokePasses(
DeviceSegmentedReduceKernelT segmented_reduce_kernel,
ActivePolicyT policy = {},
)#

Invocation.

Template Parameters:
  • ActivePolicyT – Umbrella policy active for the target device

  • DeviceSegmentedReduceKernelT – Function type of cub::DeviceSegmentedReduceKernel

Parameters:

  • segmented_reduce_kernel[in] Kernel function pointer to a parameterization of cub::DeviceSegmentedReduceKernel

template<typename ActivePolicyT>
inline cudaError_t Invoke(
ActivePolicyT policy = {},
)#

Invocation.

Public Members

void *d_temp_storage#

Device-accessible allocation of temporary storage.

When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

size_t &temp_storage_bytes#

Reference to size in bytes of d_temp_storage allocation.

InputIteratorT d_in#

Pointer to the input sequence of data items.

OutputIteratorT d_out#

Pointer to the output aggregate.

::cuda::std::int64_t num_segments#

The number of segments that comprise the segmented reduction data.

BeginOffsetIteratorT d_begin_offsets#

Random-access input iterator to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i] is the index of the first element of the ith data segment in d_in.

EndOffsetIteratorT d_end_offsets#

Random-access input iterator to the sequence of ending offsets of length num_segments, such that d_end_offsets[i] - 1 is the index of the last element of the ith data segment in d_in.

If d_end_offsets[i] <= d_begin_offsets[i], the ith segment is considered empty.

ReductionOpT reduction_op#

Binary reduction functor.

InitT init#

The initial value of the reduction.

cudaStream_t stream#

CUDA stream to launch kernels within (stream 0 is the default stream).

int ptx_version#
KernelSource kernel_source#
KernelLauncherFactory launcher_factory#

Public Static Functions

template<typename MaxPolicyT = typename PolicyHub::MaxPolicy>
static inline cudaError_t Dispatch(
void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
ReductionOpT reduction_op,
InitT init,
cudaStream_t stream,
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {},
MaxPolicyT max_policy = {},
)#

Internal dispatch routine for computing a device-wide segmented reduction.

Parameters:
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • d_in[in] Pointer to the input sequence of data items

  • d_out[out] Pointer to the output aggregate

  • num_segments[in] The number of segments that comprise the segmented reduction data

  • d_begin_offsets[in] Random-access input iterator to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i] is the index of the first element of the ith data segment in d_in

  • d_end_offsets[in] Random-access input iterator to the sequence of ending offsets of length num_segments, such that d_end_offsets[i] - 1 is the index of the last element of the ith data segment in d_in. If d_end_offsets[i] <= d_begin_offsets[i], the ith segment is considered empty.

  • reduction_op[in] Binary reduction functor

  • init[in] The initial value of the reduction

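  • stream[in] CUDA stream to launch kernels within. The signature declares no default for this parameter, so it must always be passed explicitly (stream 0 selects the default stream).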