cub::DeviceCopy#
-
struct DeviceCopy#
cub::DeviceCopy provides device-wide, parallel operations for copying data.
Public Static Functions
-
template<typename InputIt, typename OutputIt, typename SizeIteratorT>
static inline cudaError_t Batched(
    void *d_temp_storage,
    size_t &temp_storage_bytes,
    InputIt input_it,
    OutputIt output_it,
    SizeIteratorT sizes,
    ::cuda::std::int64_t num_ranges,
    cudaStream_t stream = nullptr)
Copies data from a batch of given source ranges to their corresponding destination ranges.
Snippet#
The code snippet below illustrates usage of DeviceCopy::Batched to perform a run-length decode operation (similar to DeviceRunLengthDecode).
struct GetIteratorToRange { __host__ __device__ __forceinline__ auto operator()(uint32_t index) { return thrust::make_constant_iterator(d_data_in[index]); } int32_t *d_data_in; }; struct GetPtrToRange { __host__ __device__ __forceinline__ auto operator()(uint32_t index) { return d_data_out + d_offsets[index]; } int32_t *d_data_out; uint32_t *d_offsets; }; struct GetRunLength { __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index) { return d_offsets[index + 1] - d_offsets[index]; } uint32_t *d_offsets; }; uint32_t num_ranges = 5; int32_t *d_data_in; // e.g., [4, 2, 7, 3, 1] int32_t *d_data_out; // e.g., [0, ... ] uint32_t *d_offsets; // e.g., [0, 2, 5, 6, 9, 14] // Returns a constant iterator to the element of the i-th run thrust::counting_iterator<uint32_t> iota(0); auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in}); // Returns the run length of the i-th run auto sizes = thrust::make_transform_iterator(iota, GetRunLength{d_offsets}); // Returns pointers to the output range for each run auto ptrs_out = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets}); // Determine temporary device storage requirements void *d_temp_storage = nullptr; size_t temp_storage_bytes = 0; cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, num_ranges); // Allocate temporary storage cudaMalloc(&d_temp_storage, temp_storage_bytes); // Run batched copy algorithm (used to perform runlength decoding) cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, num_ranges); // d_data_out <-- [4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1]
Note
If any input range aliases any output range the behavior is undefined. If any output range aliases another output range the behavior is undefined. Input ranges can alias one another.
- Template Parameters:
InputIt – [inferred] Device-accessible random-access input iterator type providing the iterators to the source ranges
OutputIt – [inferred] Device-accessible random-access input iterator type providing the iterators to the destination ranges
SizeIteratorT – [inferred] Device-accessible random-access input iterator type providing the number of items to be copied for each pair of ranges
- Parameters:
d_temp_storage – [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
temp_storage_bytes – [inout] Reference to size in bytes of the d_temp_storage allocation
input_it – [in] Device-accessible iterator providing the iterators to the source ranges
output_it – [in] Device-accessible iterator providing the iterators to the destination ranges
sizes – [in] Device-accessible iterator providing the number of elements to be copied for each pair of ranges
num_ranges – [in] The total number of range pairs
stream – [in] [optional] CUDA stream to launch kernels within. Default is stream 0.
-
template<typename T_In, typename Extents_In, typename Layout_In, typename Accessor_In, typename T_Out, typename Extents_Out, typename Layout_Out, typename Accessor_Out>
static inline cudaError_t Copy(
    void *d_temp_storage,
    size_t &temp_storage_bytes,
    ::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in,
    ::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out,
    ::cudaStream_t stream = nullptr)
Copies data from a multidimensional source mdspan to a destination mdspan.
This function performs a parallel copy operation between two mdspan objects with potentially different layouts but identical extents. The copy operation handles arbitrary-dimensional arrays and automatically manages layout transformations.
Preconditions#
The source and destination mdspans must have identical extents (same ranks and sizes).
The source and destination mdspans data handle must not be nullptr if the size is not 0.
The underlying memory of the source and destination must not overlap.
Both mdspans must point to device memory.
Snippet#
The code snippet below illustrates usage of DeviceCopy::Copy to copy between mdspans.
// Example: Copy a 2D array from row-major to column-major layout constexpr int N = 10; constexpr int M = 8; // Allocate device memory using thrust::device_vector thrust::device_vector<float> d_input(N * M); thrust::device_vector<float> d_output(N * M, thrust::no_init); using extents_t = cuda::std::extents<int, N, M>; using mdspan_in_t = cuda::std::mdspan<float, extents_t, cuda::std::layout_right>; // row-major using mdspan_out_t = cuda::std::mdspan<float, extents_t, cuda::std::layout_left>; // column-major // Create mdspans with different layouts mdspan_in_t mdspan_in(thrust::raw_pointer_cast(d_input.data()), extents_t{}); mdspan_out_t mdspan_out(thrust::raw_pointer_cast(d_output.data()), extents_t{}); // Determine temporary device storage requirements void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; auto status = cub::DeviceCopy::Copy(d_temp_storage, temp_storage_bytes, mdspan_in, mdspan_out); check_status(status); // Allocate temporary storage using thrust::device_vector thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); // Run copy algorithm status = cub::DeviceCopy::Copy(d_temp_storage, temp_storage_bytes, mdspan_in, mdspan_out); check_status(status);
- Template Parameters:
T_In – [inferred] The element type of the source mdspan
Extents_In – [inferred] The extents type of the source mdspan
Layout_In – [inferred] The layout type of the source mdspan
Accessor_In – [inferred] The accessor type of the source mdspan
T_Out – [inferred] The element type of the destination mdspan
Extents_Out – [inferred] The extents type of the destination mdspan
Layout_Out – [inferred] The layout type of the destination mdspan
Accessor_Out – [inferred] The accessor type of the destination mdspan
- Parameters:
d_temp_storage – [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
temp_storage_bytes – [inout] Reference to size in bytes of the d_temp_storage allocation
mdspan_in – [in] Source mdspan containing the data to be copied
mdspan_out – [in] Destination mdspan where the data will be copied
stream – [in] [optional] CUDA stream to launch kernels within. Default is stream 0.
- Returns:
cudaSuccess on success, cudaErrorInvalidValue if mdspan extents don’t match, or error code on failure
-
template<typename InputIt, typename OutputIt, typename SizeIteratorT>