//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
#ifndef __CUDAX__CONTAINER_ASYNC_BUFFER__
#define __CUDAX__CONTAINER_ASYNC_BUFFER__
#include <cuda/std/detail/__config>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include <thrust/copy.h>
#include <thrust/equal.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/for_each.h>
#include <cuda/__memory_resource/properties.h>
#include <cuda/__memory_resource/resource_ref.h>
#include <cuda/std/__algorithm/copy.h>
#include <cuda/std/__algorithm/equal.h>
#include <cuda/std/__algorithm/fill.h>
#include <cuda/std/__algorithm/lexicographical_compare.h>
#include <cuda/std/__algorithm/move_backward.h>
#include <cuda/std/__algorithm/rotate.h>
#include <cuda/std/__iterator/concepts.h>
#include <cuda/std/__iterator/distance.h>
#include <cuda/std/__iterator/iter_move.h>
#include <cuda/std/__iterator/reverse_iterator.h>
#include <cuda/std/__memory/construct_at.h>
#include <cuda/std/__memory/temporary_buffer.h>
#include <cuda/std/__memory/uninitialized_algorithms.h>
#include <cuda/std/__ranges/access.h>
#include <cuda/std/__ranges/concepts.h>
#include <cuda/std/__ranges/size.h>
#include <cuda/std/__ranges/unwrap_end.h>
#include <cuda/std/__type_traits/is_nothrow_move_assignable.h>
#include <cuda/std/__type_traits/is_swappable.h>
#include <cuda/std/__type_traits/is_trivially_copyable.h>
#include <cuda/std/__utility/forward.h>
#include <cuda/std/__utility/move.h>
#include <cuda/std/cstdint>
#include <cuda/std/detail/libcxx/include/stdexcept>
#include <cuda/std/initializer_list>
#include <cuda/std/limits>
#include <cuda/experimental/__container/heterogeneous_iterator.cuh>
#include <cuda/experimental/__container/uninitialized_async_buffer.cuh>
#include <cuda/experimental/__detail/utility.cuh>
#include <cuda/experimental/__execution/env.cuh>
#include <cuda/experimental/__execution/policy.cuh>
#include <cuda/experimental/__launch/host_launch.cuh>
#include <cuda/experimental/__memory_resource/any_resource.cuh>
#include <cuda/experimental/__memory_resource/get_memory_resource.cuh>
#include <cuda/experimental/__memory_resource/properties.cuh>
#include <cuda/experimental/__stream/get_stream.cuh>
#include <cuda/experimental/__utility/ensure_current_device.cuh>
#include <cuda/experimental/__utility/select_execution_space.cuh>
_CCCL_PUSH_MACROS
namespace cuda::experimental
{
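// async_buffer is an owning, stream-ordered container: it pairs an
// uninitialized_async_buffer with an element count and an execution policy.
// All element initialization, fills, and copies are enqueued on the stream the
// buffer was created with; synchronization is explicit via wait() or the
// synchronizing accessors below.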
template <class _Tp, class... _Properties>
class async_buffer
{
public:
using value_type = _Tp;
using reference = _Tp&;
using const_reference = const _Tp&;
using pointer = _Tp*;
using const_pointer = const _Tp*;
using iterator = heterogeneous_iterator<_Tp, _Properties...>;
using const_iterator = heterogeneous_iterator<const _Tp, _Properties...>;
using reverse_iterator = _CUDA_VSTD::reverse_iterator<iterator>;
using const_reverse_iterator = _CUDA_VSTD::reverse_iterator<const_iterator>;
using size_type = _CUDA_VSTD::size_t;
using difference_type = _CUDA_VSTD::ptrdiff_t;
using __env_t = ::cuda::experimental::env_t<_Properties...>;
using __policy_t = ::cuda::experimental::execution::execution_policy;
using __buffer_t = ::cuda::experimental::uninitialized_async_buffer<_Tp, _Properties...>;
using __resource_t = ::cuda::experimental::any_async_resource<_Properties...>;
using __resource_ref_t = _CUDA_VMR::async_resource_ref<_Properties...>;
template <class, class...>
friend class async_buffer;
// For now we require a trivially copyable type to simplify the implementation
static_assert(_CCCL_TRAIT(_CUDA_VSTD::is_trivially_copyable, _Tp),
"cuda::experimental::async_buffer requires T to be trivially copyable.");
// At least one of the properties must signal an execution space
static_assert(_CUDA_VMR::__contains_execution_space_property<_Properties...>,
"The properties of cuda::experimental::async_buffer must contain at least one execution space "
"property!");
static constexpr bool __is_host_only = __select_execution_space<_Properties...> == _ExecutionSpace::__host;
private:
__buffer_t __buf_;
size_type __size_ = 0; // initialized to 0 in case initialization of the elements throws
__policy_t __policy_ = __policy_t::invalid_execution_policy;
template <class _Range>
static constexpr bool __compatible_range = _CUDA_VRANGES::__container_compatible_range<_Range, _Tp>;
template <class... _OtherProperties>
static constexpr bool __properties_match =
_CUDA_VSTD::__type_set_contains_v<_CUDA_VSTD::__make_type_set<_OtherProperties...>, _Properties...>;
template <class... _OtherProperties>
static constexpr cudaMemcpyKind __transfer_kind =
__select_execution_space<_OtherProperties...> == _ExecutionSpace::__host
? (__is_host_only ? cudaMemcpyHostToHost : cudaMemcpyHostToDevice)
: (__is_host_only ? cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice);
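// Returns a mutable reference to the stored memory resource. The const_cast is
// needed because the buffer only exposes its resource as const, while
// async_resource_ref requires a mutable resource.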
__resource_ref_t __borrow_resource() const noexcept
{
return const_cast<__resource_t&>(__buf_.get_memory_resource());
}
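// Replaces the allocation if the current size is smaller than __count
// (discarding the old contents), then copies [__first, __last) into the buffer
// in stream order and updates the size.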
template <class _Iter>
_CCCL_HIDE_FROM_ABI void __assign_impl(const size_type __count, _Iter __first, _Iter __last)
{
if (__size_ < __count)
{
(void) __buf_.__replace_allocation(__count);
}
this->__copy_cross<_Iter>(__first, __last, __unwrapped_begin(), __count);
__size_ = __count;
}
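// Copies [__first, __last) to __dest with cudaMemcpyAsync on the buffer's
// stream. Non-contiguous input is staged through a temporary host buffer.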
template <class _Iter>
_CCCL_HIDE_FROM_ABI void __copy_cross(_Iter __first, [[maybe_unused]] _Iter __last, pointer __dest, size_type __count)
{
if (__count == 0)
{
return;
}
if constexpr (!_CUDA_VSTD::contiguous_iterator<_Iter>)
{ // For non-contiguous iterators we first copy into temporary host storage so we can use cudaMemcpyAsync
// This is currently only supported for host input, because non-contiguous data is not expected on device
auto __temp = _CUDA_VSTD::get_temporary_buffer<_Tp>(__count).first;
::cuda::experimental::host_launch(__buf_.get_stream(), _CUDA_VSTD::copy<_Iter, pointer>, __first, __last, __temp);
// FIXME: Something is fishy here. We need to wait otherwise the data is not properly set.
// The test passes fine with compute-sanitizer but we really do not want to take the performance hit for this.
// See https://github.com/NVIDIA/cccl/issues/3814
__buf_.get_stream().wait();
_CCCL_TRY_CUDA_API(
::cudaMemcpyAsync,
"cudax::async_buffer::__copy_cross: failed to copy data",
__dest,
__temp,
sizeof(_Tp) * __count,
::cudaMemcpyDefault,
__buf_.get_stream().get());
// We need to free the temporary buffer in stream order so the allocation outlives the asynchronous copy
::cuda::experimental::host_launch(__buf_.get_stream(), _CUDA_VSTD::return_temporary_buffer<_Tp>, __temp);
}
else
{
_CCCL_TRY_CUDA_API(
::cudaMemcpyAsync,
"cudax::async_buffer::__copy_cross: failed to copy data",
__dest,
_CUDA_VSTD::to_address(__first),
sizeof(_Tp) * __count,
::cudaMemcpyDefault,
__buf_.get_stream().get());
}
}
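// Value-initializes __count elements starting at __first in stream order:
// through host_launch for host-only buffers, through thrust::fill_n on the
// stream otherwise.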
_CCCL_HIDE_FROM_ABI void __value_initialize_n(pointer __first, size_type __count)
{
if (__count == 0)
{
return;
}
if constexpr (__is_host_only)
{
::cuda::experimental::host_launch(
__buf_.get_stream(), _CUDA_VSTD::uninitialized_value_construct_n<pointer, size_type>, __first, __count);
}
else
{
::cuda::experimental::__ensure_current_device __guard(__buf_.get_stream());
thrust::fill_n(thrust::cuda::par_nosync.on(__buf_.get_stream()), __first, __count, _Tp());
}
}
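// Fills __count elements starting at __first with __value in stream order,
// dispatching to host_launch or thrust::fill_n as above.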
_CCCL_HIDE_FROM_ABI void __fill_n(pointer __first, size_type __count, const _Tp& __value)
{
if (__count == 0)
{
return;
}
if constexpr (__is_host_only)
{
::cuda::experimental::host_launch(
__buf_.get_stream(), _CUDA_VSTD::uninitialized_fill_n<pointer, size_type, _Tp>, __first, __count, __value);
}
else
{
::cuda::experimental::__ensure_current_device __guard(__buf_.get_stream());
thrust::fill_n(thrust::cuda::par_nosync.on(__buf_.get_stream()), __first, __count, __value);
}
}
public:
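// Copy constructor: allocates a new buffer with __other's memory resource on
// __other's stream and enqueues a copy of the elements; does not synchronize.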
_CCCL_HIDE_FROM_ABI async_buffer(const async_buffer& __other)
: __buf_(__other.get_memory_resource(), __other.get_stream(), __other.__size_)
, __size_(__other.__size_)
, __policy_(__other.__policy_)
{
this->__copy_cross<const_pointer>(
__other.__unwrapped_begin(), __other.__unwrapped_end(), __unwrapped_begin(), __other.__size_);
}
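// Move constructor: steals __other's allocation and leaves __other empty with
// an invalid execution policy.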
_CCCL_HIDE_FROM_ABI async_buffer(async_buffer&& __other) noexcept
: __buf_(_CUDA_VSTD::move(__other.__buf_))
, __size_(_CUDA_VSTD::exchange(__other.__size_, 0))
, __policy_(_CUDA_VSTD::exchange(__other.__policy_, __policy_t::invalid_execution_policy))
{}
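// Converting copy and move constructors: accept a buffer whose property set
// contains all of this buffer's properties.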
_CCCL_TEMPLATE(class... _OtherProperties)
_CCCL_REQUIRES(__properties_match<_OtherProperties...>)
_CCCL_HIDE_FROM_ABI explicit async_buffer(const async_buffer<_Tp, _OtherProperties...>& __other)
: __buf_(__other.get_memory_resource(), __other.get_stream(), __other.__size_)
, __size_(__other.__size_)
, __policy_(__other.__policy_)
{
this->__copy_cross<const_pointer>(
__other.__unwrapped_begin(), __other.__unwrapped_end(), __unwrapped_begin(), __other.__size_);
}
_CCCL_TEMPLATE(class... _OtherProperties)
_CCCL_REQUIRES(__properties_match<_OtherProperties...>)
_CCCL_HIDE_FROM_ABI explicit async_buffer(async_buffer<_Tp, _OtherProperties...>&& __other) noexcept
: __buf_(_CUDA_VSTD::move(__other.__buf_))
, __size_(_CUDA_VSTD::exchange(__other.__size_, 0))
, __policy_(_CUDA_VSTD::exchange(__other.__policy_, __policy_t::invalid_execution_policy))
{}
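// Constructors from an environment, which supplies the memory resource, the
// stream, and the execution policy: empty, value-initialized, filled with
// __value, or left uninitialized (uninit_t).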
_CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env)
: async_buffer(__env, 0, ::cuda::experimental::uninit)
{}
_CCCL_HIDE_FROM_ABI explicit async_buffer(const __env_t& __env, const size_type __size)
: async_buffer(__env, __size, ::cuda::experimental::uninit)
{
this->__value_initialize_n(__unwrapped_begin(), __size);
}
_CCCL_HIDE_FROM_ABI explicit async_buffer(const __env_t& __env, const size_type __size, const _Tp& __value)
: async_buffer(__env, __size, ::cuda::experimental::uninit)
{
this->__fill_n(__unwrapped_begin(), __size, __value);
}
_CCCL_HIDE_FROM_ABI explicit async_buffer(const __env_t& __env, const size_type __size, ::cuda::experimental::uninit_t)
: __buf_(::cuda::experimental::get_memory_resource(__env), ::cuda::experimental::get_stream(__env), __size)
, __size_(__size)
, __policy_(__env.query(::cuda::experimental::execution::get_execution_policy))
{}
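// Constructors from an iterator pair, an initializer_list, or a forward range
// (sized or not): the size is computed up front, storage is allocated
// uninitialized, and the elements are copied in stream order.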
_CCCL_TEMPLATE(class _Iter)
_CCCL_REQUIRES(_CUDA_VSTD::__is_cpp17_forward_iterator<_Iter>::value)
_CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env, _Iter __first, _Iter __last)
: async_buffer(__env, static_cast<size_type>(_CUDA_VSTD::distance(__first, __last)), ::cuda::experimental::uninit)
{
this->__copy_cross<_Iter>(__first, __last, __unwrapped_begin(), __size_);
}
_CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env, _CUDA_VSTD::initializer_list<_Tp> __ilist)
: async_buffer(__env, __ilist.size(), ::cuda::experimental::uninit)
{
this->__copy_cross(__ilist.begin(), __ilist.end(), __unwrapped_begin(), __size_);
}
_CCCL_TEMPLATE(class _Range)
_CCCL_REQUIRES(__compatible_range<_Range> _CCCL_AND _CUDA_VRANGES::forward_range<_Range> _CCCL_AND
_CUDA_VRANGES::sized_range<_Range>)
_CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env, _Range&& __range)
: async_buffer(__env, static_cast<size_type>(_CUDA_VRANGES::size(__range)), ::cuda::experimental::uninit)
{
using _Iter = _CUDA_VRANGES::iterator_t<_Range>;
this->__copy_cross<_Iter>(
_CUDA_VRANGES::begin(__range), _CUDA_VRANGES::__unwrap_end(__range), __unwrapped_begin(), __size_);
}
#ifndef _CCCL_DOXYGEN_INVOKED // doxygen conflates the overloads
_CCCL_TEMPLATE(class _Range)
_CCCL_REQUIRES(__compatible_range<_Range> _CCCL_AND _CUDA_VRANGES::forward_range<_Range> _CCCL_AND(
!_CUDA_VRANGES::sized_range<_Range>))
_CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env, _Range&& __range)
: async_buffer(
__env,
static_cast<size_type>(_CUDA_VRANGES::distance(_CUDA_VRANGES::begin(__range), _CUDA_VRANGES::end(__range))),
::cuda::experimental::uninit)
{
using _Iter = _CUDA_VRANGES::iterator_t<_Range>;
this->__copy_cross<_Iter>(
_CUDA_VRANGES::begin(__range), _CUDA_VRANGES::__unwrap_end(__range), __unwrapped_begin(), __size_);
}
#endif // _CCCL_DOXYGEN_INVOKED
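// Iterators over the element range. None of these accessors synchronize with
// the stream; call wait() first if enqueued work may still be writing the
// elements.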
[[nodiscard]] _CCCL_HIDE_FROM_ABI iterator begin() noexcept
{
return iterator{__buf_.data()};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_iterator begin() const noexcept
{
return const_iterator{__buf_.data()};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_iterator cbegin() const noexcept
{
return const_iterator{__buf_.data()};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI iterator end() noexcept
{
return iterator{__buf_.data() + __size_};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_iterator end() const noexcept
{
return const_iterator{__buf_.data() + __size_};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_iterator cend() const noexcept
{
return const_iterator{__buf_.data() + __size_};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI reverse_iterator rbegin() noexcept
{
return reverse_iterator{end()};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept
{
return const_reverse_iterator{end()};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept
{
return const_reverse_iterator{end()};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI reverse_iterator rend() noexcept
{
return reverse_iterator{begin()};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept
{
return const_reverse_iterator{begin()};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept
{
return const_reverse_iterator{begin()};
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI pointer data() noexcept
{
return __buf_.data();
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_pointer data() const noexcept
{
return __buf_.data();
}
#ifndef _CCCL_DOXYGEN_INVOKED
[[nodiscard]] _CCCL_HIDE_FROM_ABI pointer __unwrapped_begin() noexcept
{
return __buf_.data();
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_pointer __unwrapped_begin() const noexcept
{
return __buf_.data();
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI pointer __unwrapped_end() noexcept
{
return __buf_.data() + __size_;
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_pointer __unwrapped_end() const noexcept
{
return __buf_.data() + __size_;
}
#endif // _CCCL_DOXYGEN_INVOKED
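// Element access: get() waits on the stream before returning a reference,
// while get_unsynchronized() skips the wait and requires the caller to have
// synchronized beforehand.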
[[nodiscard]] _CCCL_HIDE_FROM_ABI reference get(const size_type __n) noexcept
{
_CCCL_ASSERT(__n < __size_, "cuda::experimental::async_vector::get out of range!");
this->wait();
return begin()[__n];
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_reference get(const size_type __n) const noexcept
{
_CCCL_ASSERT(__n < __size_, "cuda::experimental::async_vector::get out of range!");
this->wait();
return begin()[__n];
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI reference get_unsynchronized(const size_type __n) noexcept
{
_CCCL_ASSERT(__n < __size_, "cuda::experimental::async_vector::get_unsynchronized out of range!");
return begin()[__n];
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const_reference get_unsynchronized(const size_type __n) const noexcept
{
_CCCL_ASSERT(__n < __size_, "cuda::experimental::async_vector::get_unsynchronized out of range!");
return begin()[__n];
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI size_type size() const noexcept
{
return __size_;
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI bool empty() const noexcept
{
return __size_ == 0;
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI const __resource_t& get_memory_resource() const noexcept
{
return __buf_.get_memory_resource();
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr ::cuda::stream_ref get_stream() const noexcept
{
return __buf_.get_stream();
}
_CCCL_HIDE_FROM_ABI constexpr void change_stream(::cuda::stream_ref __new_stream)
{
__buf_.change_stream(__new_stream);
}
_CCCL_HIDE_FROM_ABI constexpr void change_stream_unsynchronized(::cuda::stream_ref __new_stream) noexcept
{
__buf_.change_stream_unsynchronized(__new_stream);
}
[[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr __policy_t get_execution_policy() const noexcept
{
return __policy_;
}
_CCCL_HIDE_FROM_ABI constexpr void set_execution_policy(__policy_t __new_policy) noexcept
{
__policy_ = __new_policy;
}
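// Blocks the calling thread until all work enqueued on the buffer's stream has
// completed.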
_CCCL_HIDE_FROM_ABI void wait() const
{
__buf_.get_stream().wait();
}
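// assign and assign_range replace the contents, reallocating when the new
// element count exceeds the current size; all fills and copies are enqueued in
// stream order.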
_CCCL_HIDE_FROM_ABI void assign(const size_type __count, const _Tp& __value)
{
if (__size_ < __count)
{
(void) __buf_.__replace_allocation(__count);
}
this->__fill_n(__unwrapped_begin(), __count, __value);
__size_ = __count;
}
_CCCL_TEMPLATE(class _Iter)
_CCCL_REQUIRES(_CUDA_VSTD::__is_cpp17_forward_iterator<_Iter>::value)
_CCCL_HIDE_FROM_ABI void assign(_Iter __first, _Iter __last)
{
const auto __count = static_cast<size_type>(_CUDA_VSTD::distance(__first, __last));
this->__assign_impl(__count, __first, __last);
}
_CCCL_HIDE_FROM_ABI void assign(_CUDA_VSTD::initializer_list<_Tp> __ilist)
{
const auto __count = static_cast<size_type>(__ilist.size());
this->__assign_impl(__count, __ilist.begin(), __ilist.end());
}
_CCCL_TEMPLATE(class _Range)
_CCCL_REQUIRES(__compatible_range<_Range> _CCCL_AND _CUDA_VRANGES::forward_range<_Range> _CCCL_AND
_CUDA_VRANGES::sized_range<_Range>)
_CCCL_HIDE_FROM_ABI void assign_range(_Range&& __range)
{
const auto __count = static_cast<size_type>(_CUDA_VRANGES::size(__range));
using _Iter = _CUDA_VRANGES::iterator_t<_Range>;
this->__assign_impl<_Iter>(__count, _CUDA_VRANGES::begin(__range), _CUDA_VRANGES::__unwrap_end(__range));
}
#ifndef _CCCL_DOXYGEN_INVOKED // doxygen conflates the overloads
_CCCL_TEMPLATE(class _Range)
_CCCL_REQUIRES(__compatible_range<_Range> _CCCL_AND _CUDA_VRANGES::forward_range<_Range> _CCCL_AND(
!_CUDA_VRANGES::sized_range<_Range>))
_CCCL_HIDE_FROM_ABI void assign_range(_Range&& __range)
{
const auto __first = _CUDA_VRANGES::begin(__range);
const auto __last = _CUDA_VRANGES::__unwrap_end(__range);
const auto __count = static_cast<size_type>(_CUDA_VRANGES::distance(__first, __last));
using _Iter = _CUDA_VRANGES::iterator_t<_Range>;
this->__assign_impl<_Iter>(__count, __first, __last);
}
#endif // _CCCL_DOXYGEN_INVOKED
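// Swaps storage and size with __other; note that the execution policy member
// is not swapped.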
_CCCL_HIDE_FROM_ABI void swap(async_buffer& __other) noexcept
{
_CUDA_VSTD::swap(__buf_, __other.__buf_);
_CUDA_VSTD::swap(__size_, __other.__size_);
}
_CCCL_HIDE_FROM_ABI friend void swap(async_buffer& __lhs, async_buffer& __rhs) noexcept
{
__lhs.swap(__rhs);
}
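// Equality: sizes must match and all elements must compare equal. Host-only
// buffers wait on both streams and compare on the host; otherwise
// thrust::equal runs on __lhs's stream.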
_CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI bool operator==(const async_buffer& __lhs, const async_buffer& __rhs)
{
if constexpr (__is_host_only)
{
// We need to wait here because `host_launch` does not return a value, so we cannot easily put the comparison in stream order
__lhs.wait();
__rhs.wait();
return _CUDA_VSTD::equal(
__lhs.__unwrapped_begin(), __lhs.__unwrapped_end(), __rhs.__unwrapped_begin(), __rhs.__unwrapped_end());
}
else
{
::cuda::experimental::__ensure_current_device __guard(__lhs.get_stream().get());
return (__lhs.size() == __rhs.size())
&& thrust::equal(thrust::cuda::par_nosync.on(__lhs.get_stream()),
__lhs.__unwrapped_begin(),
__lhs.__unwrapped_end(),
__rhs.__unwrapped_begin());
}
_CCCL_UNREACHABLE();
}
#if _CCCL_STD_VER <= 2017
_CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI bool operator!=(const async_buffer& __lhs, const async_buffer& __rhs)
{
return !(__lhs == __rhs);
}
#endif // _CCCL_STD_VER <= 2017
#ifndef _CCCL_DOXYGEN_INVOKED // friend functions are currently broken
_CCCL_TEMPLATE(class _Property)
_CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
_CCCL_HIDE_FROM_ABI friend void get_property(const async_buffer&, _Property) noexcept {}
#endif // _CCCL_DOXYGEN_INVOKED
};
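// Convenience aliases for buffers tagged with a single execution-space
// property.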
template <class _Tp>
using async_device_buffer = async_buffer<_Tp, _CUDA_VMR::device_accessible>;
template <class _Tp>
using async_host_buffer = async_buffer<_Tp, _CUDA_VMR::host_accessible>;
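// make_async_buffer copies an existing buffer into a new one, optionally with
// a different memory resource (and thus different properties) and/or stream.
// The source stream is synchronized before the copy is enqueued on the target
// stream.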
template <class _Tp, class... _TargetProperties, class... _SourceProperties>
async_buffer<_Tp, _TargetProperties...> make_async_buffer(
const async_buffer<_Tp, _SourceProperties...>& __source,
any_async_resource<_TargetProperties...> __mr,
cuda::stream_ref __stream)
{
env_t<_TargetProperties...> __env{__mr, __stream};
async_buffer<_Tp, _TargetProperties...> __res{__env, __source.size(), uninit};
__source.wait();
_CCCL_TRY_CUDA_API(
::cudaMemcpyAsync,
"cudax::async_buffer::__copy_cross: failed to copy data",
__res.__unwrapped_begin(),
__source.__unwrapped_begin(),
sizeof(_Tp) * __source.size(),
cudaMemcpyKind::cudaMemcpyDefault,
__stream.get());
return __res;
}
template <class _Tp, class... _TargetProperties, class... _SourceProperties>
async_buffer<_Tp, _TargetProperties...> make_async_buffer(
const async_buffer<_Tp, _SourceProperties...>& __source, any_async_resource<_TargetProperties...> __mr)
{
return ::cuda::experimental::make_async_buffer(__source, __mr, __source.get_stream());
}
template <class _Tp, class... _SourceProperties>
async_buffer<_Tp, _SourceProperties...> make_async_buffer(const async_buffer<_Tp, _SourceProperties...>& __source)
{
return ::cuda::experimental::make_async_buffer(__source, __source.get_memory_resource(), __source.get_stream());
}
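// Example usage (illustrative sketch, not compiled as part of this header;
// assumes `mr` is a device-accessible async memory resource and `stream` is a
// cuda::experimental::stream):
//
//   cuda::experimental::env_t<cuda::mr::device_accessible> env{mr, stream};
//   cuda::experimental::async_device_buffer<int> buf{env, 1024, 42}; // 1024 elements, all 42
//   buf.wait(); // block until the fill enqueued on `stream` has completed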
} // namespace cuda::experimental
_CCCL_POP_MACROS
#endif //__CUDAX__CONTAINER_ASYNC_BUFFER__