cub::DeviceRleDispatch#
-
template<typename InputIteratorT, typename OffsetsOutputIteratorT, typename LengthsOutputIteratorT, typename NumRunsOutputIteratorT, typename EqualityOpT, typename OffsetT, typename PolicyHub = detail::rle::non_trivial_runs::policy_hub<cub::detail::non_void_value_t<LengthsOutputIteratorT, OffsetT>, cub::detail::it_value_t<InputIteratorT>>>
struct DeviceRleDispatch#
Utility class for dispatching the appropriately-tuned kernels for DeviceRle.
- Template Parameters:
InputIteratorT – Random-access input iterator type for reading input items (may be a simple pointer type)
OffsetsOutputIteratorT – Random-access output iterator type for writing run-offset values (may be a simple pointer type)
LengthsOutputIteratorT – Random-access output iterator type for writing run-length values (may be a simple pointer type)
NumRunsOutputIteratorT – Output iterator type for recording the number of runs encountered (may be a simple pointer type)
EqualityOpT – Equality operator type used to compare input items
OffsetT – Signed integer type for global offsets
PolicyHub – Implementation detail, do not specify directly, requirements on the content of this type are subject to breaking change.
Public Types
-
using local_offset_t = cuda::std::int32_t#
-
using length_t = cub::detail::non_void_value_t<LengthsOutputIteratorT, global_offset_t>#
-
using streaming_context_t = ::cuda::std::conditional_t<use_streaming_invocation, detail::rle::streaming_context<InputIteratorT, length_t, global_offset_t>, NullType>#
-
using ScanTileStateT = ReduceByKeyScanTileState<length_t, local_offset_t>#
Public Functions
- inline DeviceRleDispatch(
- void *d_temp_storage,
- size_t &temp_storage_bytes,
- InputIteratorT d_in,
- OffsetsOutputIteratorT d_offsets_out,
- LengthsOutputIteratorT d_lengths_out,
- NumRunsOutputIteratorT d_num_runs_out,
- EqualityOpT equality_op,
- global_offset_t num_items,
- cudaStream_t stream,
-
template<typename ActivePolicyT, typename DeviceScanInitKernelPtr, typename DeviceRleSweepKernelPtr>
inline cudaError_t Invoke( - DeviceScanInitKernelPtr device_scan_init_kernel,
- DeviceRleSweepKernelPtr device_rle_sweep_kernel,
Internal dispatch routine for computing a device-wide run-length-encode using the specified kernel functions.
- Template Parameters:
DeviceScanInitKernelPtr – Function type of cub::DeviceScanInitKernel
DeviceRleSweepKernelPtr – Function type of cub::DeviceRleSweepKernel
- Parameters:
device_scan_init_kernel – Kernel function pointer to parameterization of cub::DeviceScanInitKernel
device_rle_sweep_kernel – Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
-
template<class ActivePolicyT>
inline cudaError_t Invoke()#
Public Members
-
void *d_temp_storage#
-
size_t &temp_storage_bytes#
-
InputIteratorT d_in#
-
OffsetsOutputIteratorT d_offsets_out#
-
LengthsOutputIteratorT d_lengths_out#
-
NumRunsOutputIteratorT d_num_runs_out#
-
EqualityOpT equality_op#
-
global_offset_t num_items#
-
cudaStream_t stream#
Public Static Functions
- static inline cudaError_t Dispatch(
- void *d_temp_storage,
- size_t &temp_storage_bytes,
- InputIteratorT d_in,
- OffsetsOutputIteratorT d_offsets_out,
- LengthsOutputIteratorT d_lengths_out,
- NumRunsOutputIteratorT d_num_runs_out,
- EqualityOpT equality_op,
- OffsetT num_items,
- cudaStream_t stream,
Internal dispatch routine.
- Parameters:
d_temp_storage – Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
temp_storage_bytes – Reference to size in bytes of d_temp_storage allocation
d_in – Pointer to input sequence of data items
d_offsets_out – Pointer to output sequence of run-offsets
d_lengths_out – Pointer to output sequence of run-lengths
d_num_runs_out – Pointer to total number of runs (i.e., length of d_offsets_out)
equality_op – Equality operator for input items
num_items – Total number of input items (i.e., length of d_in)
stream – [optional] CUDA stream to launch kernels within. Default is stream 0.
Public Static Attributes
-
static constexpr bool use_streaming_invocation = cuda::std::numeric_limits<OffsetT>::max() > cuda::std::numeric_limits<local_offset_t>::max()#
-
static constexpr int init_kernel_threads = 128#