cuda::experimental::copy_bytes#

Overloads#

`copy_bytes(__src, __dst, __stream)`#

template<typename _TpIn, typename _ExtentsIn, typename _LayoutPolicyIn, typename _AccessorPolicyIn, typename _TpOut, typename _ExtentsOut, typename _LayoutPolicyOut, typename _AccessorPolicyOut> inline void cuda::experimental::copy_bytes( ::cuda::host_mdspan<_TpIn, _ExtentsIn, _LayoutPolicyIn, _AccessorPolicyIn> __src, ::cuda::device_mdspan<_TpOut, _ExtentsOut, _LayoutPolicyOut, _AccessorPolicyOut> __dst, ::cuda::stream_ref __stream )

Asynchronous byte-wise mdspan copy#

copy_bytes asynchronously copies elements between a host mdspan and a device mdspan on the given CUDA stream. Two overloads are provided: host-to-device and device-to-host.

Source and destination must have the same total number of elements and identical extents once all dimensions of extent 1 have been removed.
Strides may take any value, and they are handled independently for the source and destination mdspans.
Element types must be trivially copyable and (ignoring cv-qualification) the same type.
Each layout policy must be either one of the predefined cuda::std layout policies (layout_right, layout_left, layout_stride) or cuda::layout_stride_relaxed.
Accessor policies must be convertible to cuda::std::default_accessor.
The destination must not have an interleaved stride order.

The implementation is optimized to maximize the contiguous memory regions to copy and relies on batched asynchronous memcpy.

#include <cuda/experimental/copy_bytes.cuh>

  using extents_t = cuda::std::dims<2>;
  cuda::host_mdspan<const float, extents_t> src(src_ptr, extents);
  cuda::device_mdspan<float, extents_t>     dst(dst_ptr, extents);
  cuda::experimental::copy_bytes(src, dst, stream);

Parameters:

__src – [in] Source mdspan
__dst – [out] Destination mdspan
__stream – [in] CUDA stream for the asynchronous transfer

`copy_bytes(__src, __dst, __stream)`#

template<typename _TpIn, typename _ExtentsIn, typename _LayoutPolicyIn, typename _AccessorPolicyIn, typename _TpOut, typename _ExtentsOut, typename _LayoutPolicyOut, typename _AccessorPolicyOut> inline void cuda::experimental::copy_bytes( ::cuda::device_mdspan<_TpIn, _ExtentsIn, _LayoutPolicyIn, _AccessorPolicyIn> __src, ::cuda::host_mdspan<_TpOut, _ExtentsOut, _LayoutPolicyOut, _AccessorPolicyOut> __dst, ::cuda::stream_ref __stream )

Asynchronously copies bytes from a device mdspan to a host mdspan.

Parameters:

__src – [in] Source device mdspan
__dst – [out] Destination host mdspan
__stream – [in] CUDA stream for the asynchronous transfer

`copy_bytes(__pb, __src, __dst)`#

template<typename _SrcTy, typename _DstTy> inline graph_node_ref cuda::experimental::copy_bytes( path_builder &__pb, _SrcTy &&__src, _DstTy &&__dst )

Adds a memcpy node to a CUDA graph path that copies bytes from source to destination.

Adds a memcpy node for mdspan source and destination.

cuda::experimental::copy_bytes#

Overloads#

copy_bytes(__src, __dst, __stream)#

Asynchronous byte-wise mdspan copy#

copy_bytes(__src, __dst, __stream)#

copy_bytes(__pb, __src, __dst)#

`copy_bytes(src, dst, __stream)`#

`copy_bytes(src, dst, __stream)`#

`copy_bytes(pb, src, __dst)`#