cuda::memcpy_async
Defined in header `<cuda/barrier>`:
// (1) Asynchronously copies `size` bytes from `source` to `destination`,
//     binds the copy's completion to `barrier`, and issues the copy in the
//     current thread.
template <typename Shape, cuda::thread_scope Scope, typename CompletionFunction>
__host__ __device__
void cuda::memcpy_async(void* destination, void const* source, Shape size,
cuda::barrier<Scope, CompletionFunction>& barrier);
// (2) Asynchronously copies `size` bytes from `source` to `destination`,
//     binds the copy's completion to `barrier`, and cooperatively issues the
//     copy across all threads in `group`.
template <typename Group,
typename Shape, cuda::thread_scope Scope, typename CompletionFunction>
__host__ __device__
void cuda::memcpy_async(Group const& group,
void* destination, void const* source, Shape size,
cuda::barrier<Scope, CompletionFunction>& barrier);
Defined in header `<cuda/pipeline>`:
// (3) Asynchronously copies `size` bytes from `source` to `destination`,
//     binds the copy's completion to `pipeline`, and issues the copy in the
//     current thread.
template <typename Shape, cuda::thread_scope Scope>
__host__ __device__
void cuda::memcpy_async(void* destination, void const* source, Shape size,
cuda::pipeline<Scope>& pipeline);
// (4) Asynchronously copies `size` bytes from `source` to `destination`,
//     binds the copy's completion to `pipeline`, and cooperatively issues the
//     copy across all threads in `group`.
template <typename Group, typename Shape, cuda::thread_scope Scope>
__host__ __device__
void cuda::memcpy_async(Group const& group,
void* destination, void const* source, Shape size,
cuda::pipeline<Scope>& pipeline);
Defined in header `<cuda/annotated_ptr>`:
// (5) Convenience wrapper: raw-pointer destination, cuda::annotated_ptr source.
//     `Sync` is either cuda::barrier or cuda::pipeline.
template <typename Dst, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Dst* dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (6) Convenience wrapper: cuda::annotated_ptr for both destination and source.
//     `Sync` is either cuda::barrier or cuda::pipeline.
template<typename Dst, typename DstProperty, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(cuda::annotated_ptr<Dst, DstProperty> dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (7) Convenience wrapper taking a thread group: raw-pointer destination,
//     cuda::annotated_ptr source. `Sync` is either cuda::barrier or
//     cuda::pipeline.
template<typename Group, typename Dst, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Group const& group, Dst* dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (8) Convenience wrapper taking a thread group: cuda::annotated_ptr for both
//     destination and source. `Sync` is either cuda::barrier or
//     cuda::pipeline.
template<typename Group, typename Dst, typename DstProperty, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Group const& group, cuda::annotated_ptr<Dst, DstProperty> dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
`cuda::memcpy_async` asynchronously copies `size` bytes from the memory location pointed to by `source` to the memory location pointed to by `destination`. Both objects are reinterpreted as arrays of `unsigned char`.
1. Binds the asynchronous copy completion to `cuda::barrier` and issues the copy in the current thread.
2. Binds the asynchronous copy completion to `cuda::barrier` and cooperatively issues the copy across all threads in `group`.
3. Binds the asynchronous copy completion to `cuda::pipeline` and issues the copy in the current thread.
4. Binds the asynchronous copy completion to `cuda::pipeline` and cooperatively issues the copy across all threads in `group`.
5-8. Convenience wrappers using `cuda::annotated_ptr`, where `Sync` is either `cuda::barrier` or `cuda::pipeline`.
Notes
`cuda::memcpy_async` has similar constraints to `std::memcpy`, namely:
- If the objects overlap, the behavior is undefined.
- If either `destination` or `source` is an invalid or null pointer, the behavior is undefined (even if `size` is zero).
- If the objects are potentially-overlapping, the behavior is undefined.
- If the objects are not of TriviallyCopyable type, the program is ill-formed, no diagnostic required.

If `Shape` is `cuda::aligned_size_t`, `source` and `destination` are both required to be aligned on `cuda::aligned_size_t::align`, else the behavior is undefined.

If `cuda::pipeline` is in a quitted state (see `cuda::pipeline::quit`), the behavior is undefined.

For cooperative variants, if the parameters are not the same across all threads in `group`, the behavior is undefined.
Template Parameters
| `Group` | A type satisfying the [Group] concept. |
| `Shape` | Either `cuda::std::size_t` or `cuda::aligned_size_t`. |
Parameters
| `group` | The group of threads. |
| `destination` | Pointer to the memory location to copy to. |
| `source` | Pointer to the memory location to copy from. |
| `size` | The number of bytes to copy. |
| `barrier` | The barrier object used to wait on the copy completion. |
| `pipeline` | The pipeline object used to wait on the copy completion. |
Examples
#include <cuda/barrier>
// Example: issues four single-byte asynchronous copies bound to one barrier,
// then waits on that barrier before returning.
__global__ void example_kernel(char* dst, char* src) {
  // Barrier that the asynchronous copies below are bound to.
  cuda::barrier<cuda::thread_scope_system> bar;
  init(&bar, 1);

  // Byte i of dst is copied from byte 8*i of src (source offsets 0, 8, 16, 24).
  for (int i = 0; i < 4; ++i) {
    cuda::memcpy_async(dst + i, src + 8 * i, 1, bar);
  }

  // Wait on the barrier that the copies were bound to.
  bar.arrive_and_wait();
}