Algorithm#
The runtime part of the cuda/algorithm header provides stream-ordered, byte-wise primitives that operate on cuda::std::span and
cuda::std::mdspan-compatible types. They require a cuda::stream_ref to enqueue work.
cuda::copy_bytes#
Launch a byte-wise copy from source to destination on the provided stream.
Signature:
copy_bytes(stream, src, dst, config = {})Overloads accept
cuda::std::span-convertible contiguous ranges orcuda::std::mdspan-convertible multi-dimensional views.Elements must be trivially copyable
cuda::std::mdspan-convertible types must convert to an mdspan that is exhaustiveThe optional
configargument is acuda::copy_configurationthat controls source access order and managed-memory location hints
Availability: CCCL 3.1.0 / CUDA 13.1
#include <cuda/algorithm>
#include <cuda/stream>
#include <cuda/std/algorithm>
#include <cuda/std/span>
void copy_example(cuda::stream_ref s, cuda::std::span<const int> src, cuda::std::span<int> dst) {
// copy_bytes copies up to src.size_bytes(); dst can be larger.
auto n = cuda::std::min(src.size(), dst.size());
auto src_prefix = src.first(n / 2);
auto dst_prefix = dst.first(n / 2);
auto src_suffix = src.subspan(n / 2);
auto dst_suffix = dst.subspan(n / 2);
// Default behavior: enqueue a stream-ordered byte-wise copy on stream s.
cuda::copy_bytes(s, src_prefix, dst_prefix);
// Advanced behavior: customize source access order for this copy.
auto config = cuda::copy_configuration{
.src_access_order = cuda::source_access_order::during_api_call,
};
cuda::copy_bytes(s, src_suffix, dst_suffix, config);
}
cuda::fill_bytes#
Launch a byte-wise fill of the destination on the provided stream.
Overloads accept
cuda::std::span-convertible orcuda::std::mdspan-convertible destinations.Elements must be trivially copyable
cuda::std::mdspan-convertible types must convert to an mdspan that is exhaustive
Availability: CCCL 3.1.0 / CUDA 13.1
#include <cuda/algorithm>
#include <cuda/stream>
#include <cuda/std/algorithm>
#include <cuda/std/span>
void fill_example(cuda::stream_ref s, cuda::std::span<unsigned char> dst) {
// Reserve 16-byte red zones at both ends and clear the payload in between.
auto guard = cuda::std::min(static_cast<decltype(dst.size())>(16), dst.size() / 2);
auto head = dst.first(guard);
auto body = dst.subspan(guard, dst.size() - 2 * guard);
auto tail = dst.last(guard);
cuda::fill_bytes(s, head, 0xCD); // debug guard pattern
cuda::fill_bytes(s, body, 0x00); // initialize payload
cuda::fill_bytes(s, tail, 0xCD); // debug guard pattern
}