cuda::experimental::copy_bytes#
Overloads#
copy_bytes(__src, __dst, __stream)#
-
template<typename _TpIn, typename _ExtentsIn, typename _LayoutPolicyIn, typename _AccessorPolicyIn, typename _TpOut, typename _ExtentsOut, typename _LayoutPolicyOut, typename _AccessorPolicyOut>
inline void cuda::experimental::copy_bytes(
    ::cuda::host_mdspan<_TpIn, _ExtentsIn, _LayoutPolicyIn, _AccessorPolicyIn> __src,
    ::cuda::device_mdspan<_TpOut, _ExtentsOut, _LayoutPolicyOut, _AccessorPolicyOut> __dst,
    ::cuda::stream_ref __stream)
Asynchronous byte-wise mdspan copy#
copy_bytes asynchronously copies elements between a host mdspan and a device mdspan on the given CUDA stream. Two overloads are provided: host-to-device and device-to-host. Source and destination must have the same total number of elements and identical extents (after removing extent-1 dimensions).
The implementation supports any stride value independently for source and destination mdspans.
Element types must be trivially copyable and (ignoring cv-qualification) the same type.
Layout policies must be one of the predefined cuda::std layout policies (layout_right, layout_left, layout_stride) or cuda::layout_stride_relaxed.
Accessor policies must be convertible to cuda::std::default_accessor.
The destination must not have an interleaved stride order.
The implementation is optimized to maximize the contiguous memory regions to copy and relies on batched asynchronous memcpy.
#include <cuda/experimental/copy_bytes.cuh>

using extents_t = cuda::std::dims<2>;
cuda::host_mdspan<const float, extents_t> src(src_ptr, extents);
cuda::device_mdspan<float, extents_t> dst(dst_ptr, extents);
cuda::experimental::copy_bytes(src, dst, stream);
- Parameters:
__src – [in] Source mdspan
__dst – [out] Destination mdspan
__stream – [in] CUDA stream for the asynchronous transfer
copy_bytes(__src, __dst, __stream)#
-
template<typename _TpIn, typename _ExtentsIn, typename _LayoutPolicyIn, typename _AccessorPolicyIn, typename _TpOut, typename _ExtentsOut, typename _LayoutPolicyOut, typename _AccessorPolicyOut>
inline void cuda::experimental::copy_bytes(
    ::cuda::device_mdspan<_TpIn, _ExtentsIn, _LayoutPolicyIn, _AccessorPolicyIn> __src,
    ::cuda::host_mdspan<_TpOut, _ExtentsOut, _LayoutPolicyOut, _AccessorPolicyOut> __dst,
    ::cuda::stream_ref __stream)
Asynchronously copies bytes from a device mdspan to a host mdspan.
- Parameters:
__src – [in] Source device mdspan
__dst – [out] Destination host mdspan
__stream – [in] CUDA stream for the asynchronous transfer