cub::DeviceMemcpy#

struct DeviceMemcpy#

cub::DeviceMemcpy provides device-wide, parallel operations for copying data.

Tuning#

All algorithms in DeviceMemcpy that accept an environment can be tuned by passing a custom policy selector that returns a cub::BatchedCopyPolicy, as shown in the example below:

struct BatchedMemcpyPolicySelector
{
  __host__ __device__ constexpr auto operator()(cuda::compute_capability /*cc*/) const -> cub::BatchedCopyPolicy
  {
    return {
      .algorithm = cub::BatchedCopyAlgorithm::lookback,
      .lookback  = {
         .small_buffer = {.threads_per_block     = 128,
                          .buffers_per_thread    = 4,
                          .bytes_per_thread      = 8,
                          .prefer_pow2_bits      = false,
                          .block_level_tile_size = 256 * 32,
                          .warp_level_threshold  = 128,
                          .block_level_threshold = 8 * 1024,
                          .buffer_lookback_delay = {},
                          .block_lookback_delay  = {}},
         .large_buffer = {.threads_per_block = 256, .bytes_per_thread = 32}}};
  }
};

// Source data: 3 buffers laid out contiguously
// Buffer 0: [10, 20]     Buffer 1: [30, 40, 50]     Buffer 2: [60]
auto d_src = thrust::device_vector<int>{10, 20, 30, 40, 50, 60};

// Two separate destination buffers
auto d_dst_a = thrust::device_vector<int>(5, 0);
auto d_dst_b = thrust::device_vector<int>(1, 0);

auto d_src_ptrs = thrust::device_vector<const int*>{
  thrust::raw_pointer_cast(d_src.data()) + 0,
  thrust::raw_pointer_cast(d_src.data()) + 2,
  thrust::raw_pointer_cast(d_src.data()) + 5};

// Buffers 0,1 go to d_dst_a, buffer 2 goes to d_dst_b
auto d_dst_ptrs = thrust::device_vector<int*>{
  thrust::raw_pointer_cast(d_dst_a.data()) + 0,
  thrust::raw_pointer_cast(d_dst_a.data()) + 2,
  thrust::raw_pointer_cast(d_dst_b.data()) + 0};

auto d_sizes = thrust::device_vector<int>{
  2 * static_cast<int>(sizeof(int)), 3 * static_cast<int>(sizeof(int)), 1 * static_cast<int>(sizeof(int))};

const auto error = cub::DeviceMemcpy::Batched(
  thrust::raw_pointer_cast(d_src_ptrs.data()),
  thrust::raw_pointer_cast(d_dst_ptrs.data()),
  thrust::raw_pointer_cast(d_sizes.data()),
  3,
  cuda::execution::tune(BatchedMemcpyPolicySelector{}));
if (error != cudaSuccess)
{
  std::cerr << "cub::DeviceMemcpy::Batched failed with status: " << error << '\n';
}

thrust::device_vector<int> expected_a{10, 20, 30, 40, 50};
thrust::device_vector<int> expected_b{60};

Public Static Functions

template<typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT> static inline cudaError_t Batched( void *d_temp_storage, size_t &temp_storage_bytes, InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, ::cuda::std::int64_t num_buffers, cudaStream_t stream = nullptr )#

Copies data from a batch of given source buffers to their corresponding destination buffer.

Added in version 2.2.0: First appears in CUDA Toolkit 12.3.

Snippet#

The code snippet below illustrates usage of DeviceMemcpy::Batched for mutating strings withing a single string buffer.

struct GetPtrToStringItem
{
  __host__ __device__ __forceinline__ void *operator()(uint32_t index)
  {
    return &d_string_data_in[d_string_offsets[index]];
  }
  char *d_string_data_in;
  uint32_t *d_string_offsets;
};

struct GetStringItemSize
{
  __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
  {
    return d_string_offsets[index + 1] - d_string_offsets[index];
  }
  uint32_t *d_string_offsets;
};

uint32_t num_strings = 5;
char *d_string_data_in;         // e.g., "TomatoesBananasApplesOrangesGrapes"
char *d_string_data_out;        // e.g., "                ...               "
uint32_t *d_string_offsets_old; // e.g., [0, 8, 15, 21, 28, 34]
uint32_t *d_string_offsets_new; // e.g., [0, 6, 13, 19, 26, 34]
uint32_t *d_gather_index;       // e.g., [2, 1, 4, 3, 0]

// Initialize an iterator that returns d_gather_index[i] when the i-th item is dereferenced
auto gather_iterator = thrust::make_permutation_iterator(thrust::make_counting_iterator(0),
d_gather_index);

// Returns pointers to the input buffer for each string
auto str_ptrs_in = thrust::make_transform_iterator(gather_iterator,
                                                   GetPtrToStringItem{d_string_data_in,
d_string_offsets_old});

// Returns the string size of the i-th string
auto str_sizes = thrust::make_transform_iterator(gather_iterator,
GetStringItemSize{d_string_offsets_old});

// Returns pointers to the output buffer for each string
auto str_ptrs_out = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                                                    GetPtrToStringItem{d_string_data_out,
d_string_offsets_new});

// Determine temporary device storage requirements
void *d_temp_storage      = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
str_sizes, num_strings);

// Allocate temporary storage
cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Run batched copy algorithm (used to permute strings)
cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
str_sizes, num_strings);

// d_string_data_out       <-- "ApplesBananasGrapesOrangesTomatoe"

Note

If any input buffer aliases memory from any output buffer the behavior is undefined. If any output buffer aliases memory of another output buffer the behavior is undefined. Input buffers can alias one another.

Template Parameters:

InputBufferIt – [inferred] Device-accessible random-access input iterator type providing the pointers to the source memory buffers
OutputBufferIt – [inferred] Device-accessible random-access input iterator type providing the pointers to the destination memory buffers
BufferSizeIteratorT – [inferred] Device-accessible random-access input iterator type providing the number of bytes to be copied for each pair of buffers

Parameters:

d_temp_storage – [in] Temporary storage for this operation. If d_temp_storage is nullptr, the required size is written to temp_storage_bytes without dereferencing iterators or launching kernels. Otherwise, d_temp_storage must point to a device-accessible allocation of at least temp_storage_bytes bytes. No special alignment is required. See :ref:device-temp-storage for usage guidance.
temp_storage_bytes – [inout] Reference to size in bytes of d_temp_storage allocation
input_buffer_it – [in] Device-accessible iterator providing the pointers to the source memory buffers
output_buffer_it – [in] Device-accessible iterator providing the pointers to the destination memory buffers
buffer_sizes – [in] Device-accessible iterator providing the number of bytes to be copied for each pair of buffers
num_buffers – [in] The total number of buffer pairs
stream – [in]
[optional] CUDA stream to launch kernels within. Default is stream₀.

template<typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT, typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputBufferIt, void*>, int> = 0> static inline cudaError_t Batched( InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, ::cuda::std::int64_t num_buffers, const EnvT &env = {} )#

Copies data from a batch of given source buffers to their corresponding destination buffer.

Added in version 3.4.0: First appears in CUDA Toolkit 13.4.

This is an environment-based API that allows customization of:

Stream: Query via cuda::get_stream
Memory resource: Query via cuda::mr::get_memory_resource

Snippet#

The code snippet below illustrates usage of DeviceMemcpy::Batched with an environment:

// Source data: 3 buffers of different sizes laid out contiguously
// Buffer 0: [10, 20]     Buffer 1: [30, 40, 50]     Buffer 2: [60]
auto d_src = thrust::device_vector<int>{10, 20, 30, 40, 50, 60};

// Copy into two separate destination buffers to highlight the API's flexibility
auto d_dst_a = thrust::device_vector<int>(5, 0);
auto d_dst_b = thrust::device_vector<int>(1, 0);

// Source pointers: one per buffer, pointing into d_src
auto d_src_ptrs = thrust::device_vector<const int*>{
  thrust::raw_pointer_cast(d_src.data()) + 0,
  thrust::raw_pointer_cast(d_src.data()) + 2,
  thrust::raw_pointer_cast(d_src.data()) + 5};

// Destination pointers: buffers 0,1 go to d_dst_a, buffer 2 goes to d_dst_b
auto d_dst_ptrs = thrust::device_vector<int*>{
  thrust::raw_pointer_cast(d_dst_a.data()) + 0,
  thrust::raw_pointer_cast(d_dst_a.data()) + 2,
  thrust::raw_pointer_cast(d_dst_b.data()) + 0};

// Sizes in bytes for each buffer
auto d_sizes = thrust::device_vector<int>{
  2 * static_cast<int>(sizeof(int)), 3 * static_cast<int>(sizeof(int)), 1 * static_cast<int>(sizeof(int))};

int num_buffers = 3;

cuda::stream stream{cuda::devices[0]};
cuda::stream_ref stream_ref{stream};

auto error = cub::DeviceMemcpy::Batched(
  thrust::raw_pointer_cast(d_src_ptrs.data()),
  thrust::raw_pointer_cast(d_dst_ptrs.data()),
  thrust::raw_pointer_cast(d_sizes.data()),
  num_buffers,
  stream_ref);
if (error != cudaSuccess)
{
  std::cerr << "cub::DeviceMemcpy::Batched failed with status: " << error << '\n';
}

thrust::device_vector<int> expected_a{10, 20, 30, 40, 50};
thrust::device_vector<int> expected_b{60};

Note

If any input buffer aliases memory from any output buffer the behavior is undefined. If any output buffer aliases memory of another output buffer the behavior is undefined. Input buffers can alias one another.

Template Parameters:

InputBufferIt – [inferred] Device-accessible random-access input iterator type providing the pointers to the source memory buffers
OutputBufferIt – [inferred] Device-accessible random-access input iterator type providing the pointers to the destination memory buffers
BufferSizeIteratorT – [inferred] Device-accessible random-access input iterator type providing the number of bytes to be copied for each pair of buffers
EnvT – [inferred] Environment type (e.g., cuda::std::execution::env<...>)

Parameters:

input_buffer_it – [in] Device-accessible iterator providing the pointers to the source memory buffers
output_buffer_it – [in] Device-accessible iterator providing the pointers to the destination memory buffers
buffer_sizes – [in] Device-accessible iterator providing the number of bytes to be copied for each pair of buffers
num_buffers – [in] The total number of buffer pairs
env – [in]
[optional] Execution environment. Default is cuda::std::execution::env{}.