cub::DispatchReduceByKey#

template<typename KeysInputIteratorT, typename UniqueOutputIteratorT, typename ValuesInputIteratorT, typename AggregatesOutputIteratorT, typename NumRunsOutputIteratorT, typename EqualityOpT, typename ReductionOpT, typename OffsetT, typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::it_value_t<ValuesInputIteratorT>, cub::detail::it_value_t<ValuesInputIteratorT>>, typename PolicyHub = detail::reduce_by_key::policy_hub<ReductionOpT, AccumT, cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::it_value_t<KeysInputIteratorT>>>>
struct DispatchReduceByKey#

Utility class for dispatching the appropriately tuned kernels for DeviceReduce::ReduceByKey.

Template Parameters:
  • KeysInputIteratorT – Random-access input iterator type for keys

  • UniqueOutputIteratorT – Random-access output iterator type for keys

  • ValuesInputIteratorT – Random-access input iterator type for values

  • AggregatesOutputIteratorT – Random-access output iterator type for values

  • NumRunsOutputIteratorT – Output iterator type for recording the number of runs encountered

  • EqualityOpT – KeyT equality operator type

  • ReductionOpT – ValueT reduction operator type

  • OffsetT – Signed integer type for global offsets

  • PolicyHub – Implementation detail; do not specify directly. Requirements on the content of this type are subject to breaking changes.

Public Types

using ValueInputT = cub::detail::it_value_t<ValuesInputIteratorT>#
using streaming_context_t = NullType#
using ScanTileStateT = ReduceByKeyScanTileState<AccumT, OffsetT>#

Public Functions

inline DispatchReduceByKey(
void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
EqualityOpT equality_op,
ReductionOpT reduction_op,
OffsetT num_items,
cudaStream_t stream,
)#
template<typename ActivePolicyT, typename ScanInitKernelT, typename ReduceByKeyKernelT>
inline cudaError_t Invoke(
ScanInitKernelT init_kernel,
ReduceByKeyKernelT reduce_by_key_kernel,
)#
template<typename ActivePolicyT>
inline cudaError_t Invoke()#

Public Members

void *d_temp_storage#
size_t &temp_storage_bytes#
KeysInputIteratorT d_keys_in#
UniqueOutputIteratorT d_unique_out#
ValuesInputIteratorT d_values_in#
AggregatesOutputIteratorT d_aggregates_out#
NumRunsOutputIteratorT d_num_runs_out#
EqualityOpT equality_op#
ReductionOpT reduction_op#
OffsetT num_items#
cudaStream_t stream#

Public Static Functions

static inline cudaError_t Dispatch(
void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
EqualityOpT equality_op,
ReductionOpT reduction_op,
OffsetT num_items,
cudaStream_t stream,
)#

Internal dispatch routine.

Parameters:
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • d_keys_in[in] Pointer to the input sequence of keys

  • d_unique_out[out] Pointer to the output sequence of unique keys (one key per run)

  • d_values_in[in] Pointer to the input sequence of corresponding values

  • d_aggregates_out[out] Pointer to the output sequence of value aggregates (one aggregate per run)

  • d_num_runs_out[out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)

  • equality_op[in] KeyT equality operator

  • reduction_op[in] ValueT reduction operator

  • num_items[in] Total number of input key/value pairs (i.e., the length of d_keys_in and d_values_in)

  • stream[in] CUDA stream to launch kernels within. Default is stream 0.

Public Static Attributes

static constexpr int INIT_KERNEL_THREADS = 128#