cub::DeviceCopy#

struct DeviceCopy#

cub::DeviceCopy provides device-wide, parallel operations for copying data.

Public Static Functions

template<typename InputIt, typename OutputIt, typename SizeIteratorT>
static inline cudaError_t Batched(
void *d_temp_storage,
size_t &temp_storage_bytes,
InputIt input_it,
OutputIt output_it,
SizeIteratorT sizes,
::cuda::std::int64_t num_ranges,
cudaStream_t stream = nullptr
)#

Copies data from a batch of given source ranges to their corresponding destination ranges.

Snippet#

The code snippet below illustrates usage of DeviceCopy::Batched to perform a device-wide run-length decode operation.

// Functor mapping a run index to a constant iterator that repeatedly
// yields that run's value, d_data_in[index].
struct GetIteratorToRange
{
  __host__ __device__ __forceinline__ auto operator()(uint32_t index)
  {
    // Fetch the run's value first, then wrap it in a constant iterator.
    const int32_t run_value = d_data_in[index];
    return thrust::make_constant_iterator(run_value);
  }
  int32_t *d_data_in; // device pointer to one value per run
};

// Functor mapping a run index to the start of that run's output range:
// d_data_out advanced by the run's begin offset.
struct GetPtrToRange
{
  __host__ __device__ __forceinline__ auto operator()(uint32_t index)
  {
    const uint32_t begin_offset = d_offsets[index];
    return d_data_out + begin_offset;
  }
  int32_t *d_data_out;  // device pointer to the output buffer
  uint32_t *d_offsets;  // begin offset of each run within d_data_out
};

// Functor mapping a run index to that run's length, computed as the
// difference of adjacent entries in the offsets array (which, per the
// snippet below, holds num_runs + 1 offsets).
struct GetRunLength
{
  __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
  {
    const uint32_t run_begin = d_offsets[index];
    const uint32_t run_end   = d_offsets[index + 1];
    return run_end - run_begin;
  }
  uint32_t *d_offsets; // exclusive-scan style offsets, one past the last run
};

// All d_* pointers are assumed to reference device-accessible memory.
uint32_t num_ranges = 5;
int32_t *d_data_in;           // e.g., [4, 2, 7, 3, 1]
int32_t *d_data_out;          // e.g., [0,                ...               ]
uint32_t *d_offsets;          // e.g., [0, 2, 5, 6, 9, 14]

// Returns a constant iterator to the element of the i-th run
thrust::counting_iterator<uint32_t> iota(0);
auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in});

// Returns the run length of the i-th run
auto sizes = thrust::make_transform_iterator(iota, GetRunLength{d_offsets});

// Returns pointers to the output range for each run
auto ptrs_out = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets});

// Determine temporary device storage requirements: with d_temp_storage == nullptr,
// Batched only writes the required size to temp_storage_bytes and does no copying.
void *d_temp_storage      = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes,
num_ranges);

// Allocate temporary storage (error checking omitted for brevity)
cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Second call with non-null d_temp_storage performs the batched copy
// (used here to perform run-length decoding).
cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes,
num_ranges);

// d_data_out       <-- [4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1]

Note

If any input range aliases any output range the behavior is undefined. If any output range aliases another output range the behavior is undefined. Input ranges can alias one another.

Template Parameters:
  • InputIt[inferred] Device-accessible random-access input iterator type providing the iterators to the source ranges

  • OutputIt[inferred] Device-accessible random-access input iterator type providing the iterators to the destination ranges

  • SizeIteratorT[inferred] Device-accessible random-access input iterator type providing the number of items to be copied for each pair of ranges

Parameters:
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • input_it[in] Device-accessible iterator providing the iterators to the source ranges

  • output_it[in] Device-accessible iterator providing the iterators to the destination ranges

  • sizes[in] Device-accessible iterator providing the number of elements to be copied for each pair of ranges

  • num_ranges[in] The total number of range pairs

  • stream[in]

    [optional] CUDA stream to launch kernels within. Default is stream 0 (the default stream).

template<typename T_In, typename Extents_In, typename Layout_In, typename Accessor_In, typename T_Out, typename Extents_Out, typename Layout_Out, typename Accessor_Out>
static inline cudaError_t Copy(
void *d_temp_storage,
size_t &temp_storage_bytes,
::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in,
::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out,
::cudaStream_t stream = nullptr
)#

Copies data from a multidimensional source mdspan to a destination mdspan.

This function performs a parallel copy operation between two mdspan objects with potentially different layouts but identical extents. The copy operation handles arbitrary-dimensional arrays and automatically manages layout transformations.

Preconditions#

  • The source and destination mdspans must have identical extents (same ranks and sizes).

  • The source and destination mdspans' data handles must not be nullptr if the size is not 0.

  • The underlying memory of the source and destination must not overlap.

  • Both mdspans must point to device memory.

Snippet#

The code snippet below illustrates usage of DeviceCopy::Copy to copy between mdspans.

// Example: Copy a 2D array from row-major to column-major layout
constexpr int N = 10;
constexpr int M = 8;

// Allocate device memory using thrust::device_vector
// (thrust::no_init skips value-initialization of the output buffer).
thrust::device_vector<float> d_input(N * M);
thrust::device_vector<float> d_output(N * M, thrust::no_init);

// Same extents on both sides (a precondition of Copy); only the layouts differ.
using extents_t    = cuda::std::extents<int, N, M>;
using mdspan_in_t  = cuda::std::mdspan<float, extents_t, cuda::std::layout_right>; // row-major
using mdspan_out_t = cuda::std::mdspan<float, extents_t, cuda::std::layout_left>; // column-major
// Create mdspans with different layouts
mdspan_in_t mdspan_in(thrust::raw_pointer_cast(d_input.data()), extents_t{});
mdspan_out_t mdspan_out(thrust::raw_pointer_cast(d_output.data()), extents_t{});

// Determine temporary device storage requirements: with d_temp_storage == nullptr,
// Copy only writes the required size to temp_storage_bytes and does no copying.
void*  d_temp_storage     = nullptr;
size_t temp_storage_bytes = 0;
auto status = cub::DeviceCopy::Copy(d_temp_storage, temp_storage_bytes, mdspan_in, mdspan_out);
check_status(status); // user-supplied cudaError_t check — presumably aborts/throws on failure

// Allocate temporary storage using thrust::device_vector
thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// Run copy algorithm (second call with non-null d_temp_storage does the work)
status = cub::DeviceCopy::Copy(d_temp_storage, temp_storage_bytes, mdspan_in, mdspan_out);
check_status(status);

Template Parameters:
  • T_In[inferred] The element type of the source mdspan

  • Extents_In[inferred] The extents type of the source mdspan

  • Layout_In[inferred] The layout type of the source mdspan

  • Accessor_In[inferred] The accessor type of the source mdspan

  • T_Out[inferred] The element type of the destination mdspan

  • Extents_Out[inferred] The extents type of the destination mdspan

  • Layout_Out[inferred] The layout type of the destination mdspan

  • Accessor_Out[inferred] The accessor type of the destination mdspan

Parameters:
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • mdspan_in[in] Source mdspan containing the data to be copied

  • mdspan_out[in] Destination mdspan where the data will be copied

  • stream[in]

    [optional] CUDA stream to launch kernels within. Default is stream 0 (the default stream).

Returns:

cudaSuccess on success, cudaErrorInvalidValue if mdspan extents don’t match, or error code on failure