cub::DeviceRleDispatch#

template<typename InputIteratorT, typename OffsetsOutputIteratorT, typename LengthsOutputIteratorT, typename NumRunsOutputIteratorT, typename EqualityOpT, typename OffsetT, typename PolicyHub = detail::rle::non_trivial_runs::policy_hub<cub::detail::non_void_value_t<LengthsOutputIteratorT, OffsetT>, cub::detail::it_value_t<InputIteratorT>>>
struct DeviceRleDispatch#

Utility class for dispatching the appropriately-tuned kernels for DeviceRle.

Template Parameters:
  • InputIteratorT – Random-access input iterator type for reading input items (may be a simple pointer type)

  • OffsetsOutputIteratorT – Random-access output iterator type for writing run-offset values (may be a simple pointer type)

  • LengthsOutputIteratorT – Random-access output iterator type for writing run-length values (may be a simple pointer type)

  • NumRunsOutputIteratorT – Output iterator type for recording the number of runs encountered (may be a simple pointer type)

  • EqualityOpT – Equality operator type used to compare input items

  • OffsetT – Signed integer type for global offsets

  • PolicyHub – Implementation detail; do not specify directly. Requirements on the content of this type are subject to breaking changes.

Public Types

using local_offset_t = cuda::std::int32_t#
using global_offset_t = OffsetT#
using length_t = cub::detail::non_void_value_t<LengthsOutputIteratorT, global_offset_t>#
using streaming_context_t = ::cuda::std::conditional_t<use_streaming_invocation, detail::rle::streaming_context<InputIteratorT, length_t, global_offset_t>, NullType>#
using ScanTileStateT = ReduceByKeyScanTileState<length_t, local_offset_t>#

Public Functions

inline DeviceRleDispatch(
void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsOutputIteratorT d_num_runs_out,
EqualityOpT equality_op,
global_offset_t num_items,
cudaStream_t stream,
)#
template<typename ActivePolicyT, typename DeviceScanInitKernelPtr, typename DeviceRleSweepKernelPtr>
inline cudaError_t Invoke(
DeviceScanInitKernelPtr device_scan_init_kernel,
DeviceRleSweepKernelPtr device_rle_sweep_kernel,
)#

Internal dispatch routine for computing a device-wide run-length-encode using the specified kernel functions.

Template Parameters:
  • DeviceScanInitKernelPtr – Function type of cub::DeviceScanInitKernel

  • DeviceRleSweepKernelPtr – Function type of cub::DeviceRleSweepKernel

Parameters:
  • device_scan_init_kernel – Kernel function pointer to parameterization of cub::DeviceScanInitKernel

  • device_rle_sweep_kernel – Kernel function pointer to parameterization of cub::DeviceRleSweepKernel

template<class ActivePolicyT>
inline cudaError_t Invoke()#

Public Members

void *d_temp_storage#
size_t &temp_storage_bytes#
InputIteratorT d_in#
OffsetsOutputIteratorT d_offsets_out#
LengthsOutputIteratorT d_lengths_out#
NumRunsOutputIteratorT d_num_runs_out#
EqualityOpT equality_op#
global_offset_t num_items#
cudaStream_t stream#

Public Static Functions

static inline cudaError_t Dispatch(
void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsOutputIteratorT d_num_runs_out,
EqualityOpT equality_op,
OffsetT num_items,
cudaStream_t stream,
)#

Internal dispatch routine.

Parameters:
  • d_temp_storage – Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes – Reference to size in bytes of d_temp_storage allocation

  • d_in – Pointer to input sequence of data items

  • d_offsets_out – Pointer to output sequence of run-offsets

  • d_lengths_out – Pointer to output sequence of run-lengths

  • d_num_runs_out – Pointer to total number of runs (i.e., length of d_offsets_out)

  • equality_op – Equality operator for input items

  • num_items – Total number of input items (i.e., length of d_in)

  • stream – [optional] CUDA stream to launch kernels within. Default is stream 0.

Public Static Attributes

static constexpr bool use_streaming_invocation = cuda::std::numeric_limits<OffsetT>::max() > cuda::std::numeric_limits<local_offset_t>::max()#
static constexpr int init_kernel_threads = 128#