cuda::memcpy_async
Defined in header <cuda/barrier>
:
// (1)
template <typename Shape, cuda::thread_scope Scope, typename CompletionFunction>
__host__ __device__
void cuda::memcpy_async(void* destination, void const* source, Shape size,
cuda::barrier<Scope, CompletionFunction>& barrier);
// (2)
template <typename Group,
typename Shape, cuda::thread_scope Scope, typename CompletionFunction>
__host__ __device__
void cuda::memcpy_async(Group const& group,
void* destination, void const* source, Shape size,
cuda::barrier<Scope, CompletionFunction>& barrier);
Defined in header <cuda/pipeline>
:
// (3)
template <typename Shape, cuda::thread_scope Scope>
__host__ __device__
void cuda::memcpy_async(void* destination, void const* source, Shape size,
cuda::pipeline<Scope>& pipeline);
// (4)
template <typename Group, typename Shape, cuda::thread_scope Scope>
__host__ __device__
void cuda::memcpy_async(Group const& group,
void* destination, void const* source, Shape size,
cuda::pipeline<Scope>& pipeline);
Defined in header <cuda/annotated_ptr>
:
// (5)
template <typename Dst, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Dst* dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (6)
template<typename Dst, typename DstProperty, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(cuda::annotated_ptr<Dst, DstProperty> dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (7)
template<typename Group, typename Dst, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Group const& group, Dst* dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (8)
template<typename Group, typename Dst, typename DstProperty, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Group const& group, cuda::annotated_ptr<Dst, DstProperty> dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
cuda::memcpy_async
asynchronously copies size
bytes from the memory location pointed to by source
to the memory location pointed to by destination
. Both objects are reinterpreted as arrays of unsigned char
.
- Binds the asynchronous copy completion to
cuda::barrier
and issues the copy in the current thread. - Binds the asynchronous copy completion to
cuda::barrier
and cooperatively issues the copy across all threads ingroup
. - Binds the asynchronous copy completion to
cuda::pipeline
and issues the copy in the current thread. - Binds the asynchronous copy completion to
cuda::pipeline
and cooperatively issues the copy across all threads ingroup
. - 5-8: convenience wrappers using
cuda::annotated_ptr
whereSync
is eithercuda::barrier
orcuda::pipeline
.
Notes
cuda::memcpy_async
have similar constraints to std::memcpy
, namely:
- If the objects overlap, the behavior is undefined.
- If either
destination
orsource
is an invalid or null pointer, the behavior is undefined (even ifcount
is zero). - If the objects are potentially-overlapping the behavior is undefined.
- If the objects are not of TriviallyCopyable type the program is ill-formed, no diagnostic required.
If Shape is cuda::aligned_size_t
, source
and destination
are both required to be aligned on cuda::aligned_size_t::align
, else the behavior is undefined.
If cuda::pipeline
is in a quitted state (see cuda::pipeline::quit
), the behavior is undefined.
For cooperative variants, if the parameters are not the same across all threads in group
, the behavior is undefined.
Template Parameters
Group | A type satisfying the [Group] concept. |
Shape | Either cuda::std::size_t or cuda::aligned_size_t . |
Parameters
group | The group of threads. |
destination | Pointer to the memory location to copy to. |
source | Pointer to the memory location to copy from. |
size | The number of bytes to copy. |
barrier | The barrier object used to wait on the copy completion. |
pipeline | The pipeline object used to wait on the copy completion. |
Examples
#include <cuda/barrier>
__global__ void example_kernel(char* dst, char* src) {
cuda::barrier<cuda::thread_scope_system> bar;
init(&bar, 1);
cuda::memcpy_async(dst, src, 1, bar);
cuda::memcpy_async(dst + 1, src + 8, 1, bar);
cuda::memcpy_async(dst + 2, src + 16, 1, bar);
cuda::memcpy_async(dst + 3, src + 24, 1, bar);
bar.arrive_and_wait();
}