cub::DeviceRleDispatch#

template<typename InputIteratorT, typename OffsetsOutputIteratorT, typename LengthsOutputIteratorT, typename NumRunsOutputIteratorT, typename EqualityOpT, typename OffsetT, typename PolicyHub = detail::rle::non_trivial_runs::policy_hub<cub::detail::non_void_value_t<LengthsOutputIteratorT, OffsetT>, cub::detail::it_value_t<InputIteratorT>>> struct DeviceRleDispatch#

Utility class for dispatching the appropriately-tuned kernels for DeviceRle.

Template Parameters:

InputIteratorT – Random-access input iterator type for reading input items (may be a simple pointer type)
OffsetsOutputIteratorT – Random-access output iterator type for writing run-offset values (may be a simple pointer type)
LengthsOutputIteratorT – Random-access output iterator type for writing run-length values (may be a simple pointer type)
NumRunsOutputIteratorT – Output iterator type for recording the number of runs encountered (may be a simple pointer type)
EqualityOpT – Binary equality operator type used to compare input items
OffsetT – Signed integer type for global offsets
PolicyHub – Implementation detail; do not specify directly. The requirements on the content of this type are subject to breaking changes.

Public Types

using local_offset_t = ::cuda::std::int32_t#

using global_offset_t = OffsetT #

using length_t = cub::detail::non_void_value_t<LengthsOutputIteratorT, global_offset_t>#

using streaming_context_t = ::cuda::std::conditional_t<use_streaming_invocation, detail::rle::streaming_context<InputIteratorT, length_t, global_offset_t>, NullType>#

using ScanTileStateT = ReduceByKeyScanTileState<length_t, local_offset_t>#

Public Functions

inline DeviceRleDispatch( void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, global_offset_t num_items, cudaStream_t stream )#

template<typename ActivePolicyT, typename DeviceScanInitKernelPtr, typename DeviceRleSweepKernelPtr> inline cudaError_t Invoke( DeviceScanInitKernelPtr device_scan_init_kernel, DeviceRleSweepKernelPtr device_rle_sweep_kernel )#

Internal dispatch routine for computing a device-wide run-length-encode using the specified kernel functions.

Template Parameters:

DeviceScanInitKernelPtr – Function type of cub::DeviceScanInitKernel
DeviceRleSweepKernelPtr – Function type of cub::DeviceRleSweepKernel

Parameters:

device_scan_init_kernel – Kernel function pointer to parameterization of cub::DeviceScanInitKernel
device_rle_sweep_kernel – Kernel function pointer to parameterization of cub::DeviceRleSweepKernel

template<class ActivePolicyT> inline cudaError_t Invoke()#

Public Members

void *d_temp_storage#

size_t &temp_storage_bytes#

InputIteratorT d_in#

OffsetsOutputIteratorT d_offsets_out#

LengthsOutputIteratorT d_lengths_out#

NumRunsOutputIteratorT d_num_runs_out#

EqualityOpT equality_op#

global_offset_t num_items#

cudaStream_t stream#

Public Static Functions

static inline cudaError_t Dispatch( void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream )#

Internal dispatch routine.

Parameters:

d_temp_storage – Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
temp_storage_bytes – Reference to size in bytes of d_temp_storage allocation
d_in – Pointer to input sequence of data items
d_offsets_out – Pointer to output sequence of run-offsets
d_lengths_out – Pointer to output sequence of run-lengths
d_num_runs_out – Pointer to total number of runs (i.e., length of d_offsets_out)
equality_op – Equality operator for input items
num_items – Total number of input items (i.e., length of d_in)
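stream – CUDA stream to launch kernels within. The signature above declares no default argument, so the caller must supply the stream explicitly (pass 0 to use the default stream, stream 0).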

Public Static Attributes

static constexpr bool use_streaming_invocation = ::cuda::std::numeric_limits<OffsetT>::max() > ::cuda::std::numeric_limits<local_offset_t>::max()#

static constexpr int init_kernel_threads = 128#