cub::DeviceMemcpy#

struct DeviceMemcpy#

cub::DeviceMemcpy provides device-wide, parallel operations for copying data.

Public Static Functions

template<typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT>
static inline cudaError_t Batched(
void *d_temp_storage,
size_t &temp_storage_bytes,
InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
::cuda::std::int64_t num_buffers,
cudaStream_t stream = 0
)#

Copies data from a batch of given source buffers to their corresponding destination buffer.

Added in version 2.2.0: First appears in CUDA Toolkit 12.3.

Snippet#

The code snippet below illustrates usage of DeviceMemcpy::Batched for mutating strings withing a single string buffer.

struct GetPtrToStringItem
{
  __host__ __device__ __forceinline__ void *operator()(uint32_t index)
  {
    return &d_string_data_in[d_string_offsets[index]];
  }
  char *d_string_data_in;
  uint32_t *d_string_offsets;
};

struct GetStringItemSize
{
  __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
  {
    return d_string_offsets[index + 1] - d_string_offsets[index];
  }
  uint32_t *d_string_offsets;
};

uint32_t num_strings = 5;
char *d_string_data_in;         // e.g., "TomatoesBananasApplesOrangesGrapes"
char *d_string_data_out;        // e.g., "                ...               "
uint32_t *d_string_offsets_old; // e.g., [0, 8, 15, 21, 28, 34]
uint32_t *d_string_offsets_new; // e.g., [0, 6, 13, 19, 26, 34]
uint32_t *d_gather_index;       // e.g., [2, 1, 4, 3, 0]

// Initialize an iterator that returns d_gather_index[i] when the i-th item is dereferenced
auto gather_iterator = thrust::make_permutation_iterator(thrust::make_counting_iterator(0),
d_gather_index);

// Returns pointers to the input buffer for each string
auto str_ptrs_in = thrust::make_transform_iterator(gather_iterator,
                                                   GetPtrToStringItem{d_string_data_in,
d_string_offsets_old});

// Returns the string size of the i-th string
auto str_sizes = thrust::make_transform_iterator(gather_iterator,
GetStringItemSize{d_string_offsets_old});

// Returns pointers to the output buffer for each string
auto str_ptrs_out = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                                                    GetPtrToStringItem{d_string_data_out,
d_string_offsets_new});

// Determine temporary device storage requirements
void *d_temp_storage      = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
str_sizes, num_strings);

// Allocate temporary storage
cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Run batched copy algorithm (used to permute strings)
cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
str_sizes, num_strings);

// d_string_data_out       <-- "ApplesBananasGrapesOrangesTomatoe"

Note

If any input buffer aliases memory from any output buffer the behavior is undefined. If any output buffer aliases memory of another output buffer the behavior is undefined. Input buffers can alias one another.

Template Parameters:
  • InputBufferIt[inferred] Device-accessible random-access input iterator type providing the pointers to the source memory buffers

  • OutputBufferIt[inferred] Device-accessible random-access input iterator type providing the pointers to the destination memory buffers

  • BufferSizeIteratorT[inferred] Device-accessible random-access input iterator type providing the number of bytes to be copied for each pair of buffers

Parameters:
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • input_buffer_it[in] Device-accessible iterator providing the pointers to the source memory buffers

  • output_buffer_it[in] Device-accessible iterator providing the pointers to the destination memory buffers

  • buffer_sizes[in] Device-accessible iterator providing the number of bytes to be copied for each pair of buffers

  • num_buffers[in] The total number of buffer pairs

  • stream[in]

    [optional] CUDA stream to launch kernels within. Default is stream0.

template<typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT, typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputBufferIt, void*>, int> = 0>
static inline cudaError_t Batched(
InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
::cuda::std::int64_t num_buffers,
EnvT env = {}
)#

Copies data from a batch of given source buffers to their corresponding destination buffer.

Added in version 3.4.0: First appears in CUDA Toolkit 13.4.

This is an environment-based API that allows customization of:

  • Stream: Query via cuda::get_stream

  • Memory resource: Query via cuda::mr::get_memory_resource

Snippet#

The code snippet below illustrates usage of DeviceMemcpy::Batched with an environment:

// Source data: 3 buffers of different sizes laid out contiguously
// Buffer 0: [10, 20]     Buffer 1: [30, 40, 50]     Buffer 2: [60]
auto d_src = thrust::device_vector<int>{10, 20, 30, 40, 50, 60};

// Copy into two separate destination buffers to highlight the API's flexibility
auto d_dst_a = thrust::device_vector<int>(5, 0);
auto d_dst_b = thrust::device_vector<int>(1, 0);

// Source pointers: one per buffer, pointing into d_src
auto d_src_ptrs = thrust::device_vector<const int*>{
  thrust::raw_pointer_cast(d_src.data()) + 0,
  thrust::raw_pointer_cast(d_src.data()) + 2,
  thrust::raw_pointer_cast(d_src.data()) + 5};

// Destination pointers: buffers 0,1 go to d_dst_a, buffer 2 goes to d_dst_b
auto d_dst_ptrs = thrust::device_vector<int*>{
  thrust::raw_pointer_cast(d_dst_a.data()) + 0,
  thrust::raw_pointer_cast(d_dst_a.data()) + 2,
  thrust::raw_pointer_cast(d_dst_b.data()) + 0};

// Sizes in bytes for each buffer
auto d_sizes = thrust::device_vector<int>{
  2 * static_cast<int>(sizeof(int)), 3 * static_cast<int>(sizeof(int)), 1 * static_cast<int>(sizeof(int))};

int num_buffers = 3;

cuda::stream stream{cuda::devices[0]};
cuda::stream_ref stream_ref{stream};
auto env = cuda::std::execution::env{stream_ref};

auto error = cub::DeviceMemcpy::Batched(
  thrust::raw_pointer_cast(d_src_ptrs.data()),
  thrust::raw_pointer_cast(d_dst_ptrs.data()),
  thrust::raw_pointer_cast(d_sizes.data()),
  num_buffers,
  env);
if (error != cudaSuccess)
{
  std::cerr << "cub::DeviceMemcpy::Batched failed with status: " << error << std::endl;
}

thrust::device_vector<int> expected_a{10, 20, 30, 40, 50};
thrust::device_vector<int> expected_b{60};

Note

If any input buffer aliases memory from any output buffer the behavior is undefined. If any output buffer aliases memory of another output buffer the behavior is undefined. Input buffers can alias one another.

Template Parameters:
  • InputBufferIt[inferred] Device-accessible random-access input iterator type providing the pointers to the source memory buffers

  • OutputBufferIt[inferred] Device-accessible random-access input iterator type providing the pointers to the destination memory buffers

  • BufferSizeIteratorT[inferred] Device-accessible random-access input iterator type providing the number of bytes to be copied for each pair of buffers

  • EnvT[inferred] Environment type (e.g., cuda::std::execution::env<...>)

Parameters:
  • input_buffer_it[in] Device-accessible iterator providing the pointers to the source memory buffers

  • output_buffer_it[in] Device-accessible iterator providing the pointers to the destination memory buffers

  • buffer_sizes[in] Device-accessible iterator providing the number of bytes to be copied for each pair of buffers

  • num_buffers[in] The total number of buffer pairs

  • env[in]

    [optional] Execution environment. Default is cuda::std::execution::env{}.