include/cuda/experimental/__container/async_buffer.cuh

//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX__CONTAINER_ASYNC_BUFFER__
#define __CUDAX__CONTAINER_ASYNC_BUFFER__

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <thrust/copy.h>
#include <thrust/equal.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/for_each.h>

#include <cuda/__memory_resource/properties.h>
#include <cuda/__memory_resource/resource_ref.h>
#include <cuda/std/__algorithm/copy.h>
#include <cuda/std/__algorithm/equal.h>
#include <cuda/std/__algorithm/fill.h>
#include <cuda/std/__algorithm/lexicographical_compare.h>
#include <cuda/std/__algorithm/move_backward.h>
#include <cuda/std/__algorithm/rotate.h>
#include <cuda/std/__iterator/concepts.h>
#include <cuda/std/__iterator/distance.h>
#include <cuda/std/__iterator/iter_move.h>
#include <cuda/std/__iterator/reverse_iterator.h>
#include <cuda/std/__memory/construct_at.h>
#include <cuda/std/__memory/temporary_buffer.h>
#include <cuda/std/__memory/uninitialized_algorithms.h>
#include <cuda/std/__ranges/access.h>
#include <cuda/std/__ranges/concepts.h>
#include <cuda/std/__ranges/size.h>
#include <cuda/std/__ranges/unwrap_end.h>
#include <cuda/std/__type_traits/is_nothrow_move_assignable.h>
#include <cuda/std/__type_traits/is_swappable.h>
#include <cuda/std/__type_traits/is_trivially_copyable.h>
#include <cuda/std/__utility/forward.h>
#include <cuda/std/__utility/move.h>
#include <cuda/std/cstdint>
#include <cuda/std/detail/libcxx/include/stdexcept>
#include <cuda/std/initializer_list>
#include <cuda/std/limits>

#include <cuda/experimental/__container/heterogeneous_iterator.cuh>
#include <cuda/experimental/__container/uninitialized_async_buffer.cuh>
#include <cuda/experimental/__detail/utility.cuh>
#include <cuda/experimental/__execution/env.cuh>
#include <cuda/experimental/__execution/policy.cuh>
#include <cuda/experimental/__launch/host_launch.cuh>
#include <cuda/experimental/__memory_resource/any_resource.cuh>
#include <cuda/experimental/__memory_resource/get_memory_resource.cuh>
#include <cuda/experimental/__memory_resource/properties.cuh>
#include <cuda/experimental/__stream/get_stream.cuh>
#include <cuda/experimental/__utility/ensure_current_device.cuh>
#include <cuda/experimental/__utility/select_execution_space.cuh>

_CCCL_PUSH_MACROS

namespace cuda::experimental
{

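//! @brief Stream-ordered, owning container of trivially copyable elements. All allocation,
//! initialization and copying is enqueued on the stream the buffer is associated with; use
//! \c wait() or the synchronizing accessors before touching the elements from the host.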
template <class _Tp, class... _Properties>
class async_buffer
{
public:
  using value_type             = _Tp;
  using reference              = _Tp&;
  using const_reference        = const _Tp&;
  using pointer                = _Tp*;
  using const_pointer          = const _Tp*;
  using iterator               = heterogeneous_iterator<_Tp, _Properties...>;
  using const_iterator         = heterogeneous_iterator<const _Tp, _Properties...>;
  using reverse_iterator       = _CUDA_VSTD::reverse_iterator<iterator>;
  using const_reverse_iterator = _CUDA_VSTD::reverse_iterator<const_iterator>;
  using size_type              = _CUDA_VSTD::size_t;
  using difference_type        = _CUDA_VSTD::ptrdiff_t;

  using __env_t          = ::cuda::experimental::env_t<_Properties...>;
  using __policy_t       = ::cuda::experimental::execution::execution_policy;
  using __buffer_t       = ::cuda::experimental::uninitialized_async_buffer<_Tp, _Properties...>;
  using __resource_t     = ::cuda::experimental::any_async_resource<_Properties...>;
  using __resource_ref_t = _CUDA_VMR::async_resource_ref<_Properties...>;

  template <class, class...>
  friend class async_buffer;

  // For now we require a trivially copyable type to simplify the implementation
  static_assert(_CCCL_TRAIT(_CUDA_VSTD::is_trivially_copyable, _Tp),
                "cuda::experimental::async_buffer requires T to be trivially copyable.");

  // At least one of the properties must signal an execution space
  static_assert(_CUDA_VMR::__contains_execution_space_property<_Properties...>,
                "The properties of cuda::experimental::async_buffer must contain at least one execution space "
                "property!");

  static constexpr bool __is_host_only = __select_execution_space<_Properties...> == _ExecutionSpace::__host;

private:
  __buffer_t __buf_;
  size_type __size_    = 0; // initialized to 0 in case initialization of the elements throws
  __policy_t __policy_ = __policy_t::invalid_execution_policy;

  template <class _Range>
  static constexpr bool __compatible_range = _CUDA_VRANGES::__container_compatible_range<_Range, _Tp>;

  template <class... _OtherProperties>
  static constexpr bool __properties_match =
    _CUDA_VSTD::__type_set_contains_v<_CUDA_VSTD::__make_type_set<_OtherProperties...>, _Properties...>;

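  // Maps the execution space of another set of properties to the cudaMemcpyKind required to copy
  // from a buffer with those properties into this one.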
  template <class... _OtherProperties>
  static constexpr cudaMemcpyKind __transfer_kind =
    __select_execution_space<_OtherProperties...> == _ExecutionSpace::__host
      ? (__is_host_only ? cudaMemcpyHostToHost : cudaMemcpyHostToDevice)
      : (__is_host_only ? cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice);

  __resource_ref_t __borrow_resource() const noexcept
  {
    return const_cast<__resource_t&>(__buf_.get_memory_resource());
  }

  template <class _Iter>
  _CCCL_HIDE_FROM_ABI void __assign_impl(const size_type __count, _Iter __first, _Iter __last)
  {
    if (__size_ < __count)
    {
      (void) __buf_.__replace_allocation(__count);
    }

    this->__copy_cross<_Iter>(__first, __last, __unwrapped_begin(), __count);
    __size_ = __count;
  }

  template <class _Iter>
  _CCCL_HIDE_FROM_ABI void __copy_cross(_Iter __first, [[maybe_unused]] _Iter __last, pointer __dest, size_type __count)
  {
    if (__count == 0)
    {
      return;
    }

    if constexpr (!_CUDA_VSTD::contiguous_iterator<_Iter>)
    { // For non-contiguous iterators we need to copy into temporary host storage to use cudaMemcpy
      // Currently only supported from host, because non-contiguous data is not expected on device
      auto __temp = _CUDA_VSTD::get_temporary_buffer<_Tp>(__count).first;
      ::cuda::experimental::host_launch(__buf_.get_stream(), _CUDA_VSTD::copy<_Iter, pointer>, __first, __last, __temp);
      // FIXME: Something is fishy here. We need to wait, otherwise the data is not properly set.
      // The test passes fine with compute-sanitizer but we really do not want to take the performance hit for this.
      // See https://github.com/NVIDIA/cccl/issues/3814
      __buf_.get_stream().wait();
      _CCCL_TRY_CUDA_API(
        ::cudaMemcpyAsync,
        "cudax::async_buffer::__copy_cross: failed to copy data",
        __dest,
        __temp,
        sizeof(_Tp) * __count,
        ::cudaMemcpyDefault,
        __buf_.get_stream().get());
      // We need to free the temporary buffer in stream order to ensure the memory survives until the copy completes
      ::cuda::experimental::host_launch(__buf_.get_stream(), _CUDA_VSTD::return_temporary_buffer<_Tp>, __temp);
    }
    else
    {
      _CCCL_TRY_CUDA_API(
        ::cudaMemcpyAsync,
        "cudax::async_buffer::__copy_cross: failed to copy data",
        __dest,
        _CUDA_VSTD::to_address(__first),
        sizeof(_Tp) * __count,
        ::cudaMemcpyDefault,
        __buf_.get_stream().get());
    }
  }

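  // For host-only buffers, element initialization and fills run as host work enqueued on the
  // stream; otherwise they run on the device through thrust with a non-blocking policy.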
  _CCCL_HIDE_FROM_ABI void __value_initialize_n(pointer __first, size_type __count)
  {
    if (__count == 0)
    {
      return;
    }

    if constexpr (__is_host_only)
    {
      ::cuda::experimental::host_launch(
        __buf_.get_stream(), _CUDA_VSTD::uninitialized_value_construct_n<pointer, size_type>, __first, __count);
    }
    else
    {
      ::cuda::experimental::__ensure_current_device __guard(__buf_.get_stream());
      thrust::fill_n(thrust::cuda::par_nosync.on(__buf_.get_stream()), __first, __count, _Tp());
    }
  }

  _CCCL_HIDE_FROM_ABI void __fill_n(pointer __first, size_type __count, const _Tp& __value)
  {
    if (__count == 0)
    {
      return;
    }

    if constexpr (__is_host_only)
    {
      ::cuda::experimental::host_launch(
        __buf_.get_stream(), _CUDA_VSTD::uninitialized_fill_n<pointer, size_type, _Tp>, __first, __count, __value);
    }
    else
    {
      ::cuda::experimental::__ensure_current_device __guard(__buf_.get_stream());
      thrust::fill_n(thrust::cuda::par_nosync.on(__buf_.get_stream()), __first, __count, __value);
    }
  }

public:

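  //! @brief Copy-constructs from \p __other, allocating from \p __other's memory resource and
  //! enqueuing the element copy on \p __other's stream.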
  _CCCL_HIDE_FROM_ABI async_buffer(const async_buffer& __other)
      : __buf_(__other.get_memory_resource(), __other.get_stream(), __other.__size_)
      , __size_(__other.__size_)
      , __policy_(__other.__policy_)
  {
    this->__copy_cross<const_pointer>(
      __other.__unwrapped_begin(), __other.__unwrapped_end(), __unwrapped_begin(), __other.__size_);
  }

  _CCCL_HIDE_FROM_ABI async_buffer(async_buffer&& __other) noexcept
      : __buf_(_CUDA_VSTD::move(__other.__buf_))
      , __size_(_CUDA_VSTD::exchange(__other.__size_, 0))
      , __policy_(_CUDA_VSTD::exchange(__other.__policy_, __policy_t::invalid_execution_policy))
  {}

  _CCCL_TEMPLATE(class... _OtherProperties)
  _CCCL_REQUIRES(__properties_match<_OtherProperties...>)
  _CCCL_HIDE_FROM_ABI explicit async_buffer(const async_buffer<_Tp, _OtherProperties...>& __other)
      : __buf_(__other.get_memory_resource(), __other.get_stream(), __other.__size_)
      , __size_(__other.__size_)
      , __policy_(__other.__policy_)
  {
    this->__copy_cross<const_pointer>(
      __other.__unwrapped_begin(), __other.__unwrapped_end(), __unwrapped_begin(), __other.__size_);
  }

  _CCCL_TEMPLATE(class... _OtherProperties)
  _CCCL_REQUIRES(__properties_match<_OtherProperties...>)
  _CCCL_HIDE_FROM_ABI explicit async_buffer(async_buffer<_Tp, _OtherProperties...>&& __other) noexcept
      : __buf_(_CUDA_VSTD::move(__other.__buf_))
      , __size_(_CUDA_VSTD::exchange(__other.__size_, 0))
      , __policy_(_CUDA_VSTD::exchange(__other.__policy_, __policy_t::invalid_execution_policy))
  {}

  _CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env)
      : async_buffer(__env, 0, ::cuda::experimental::uninit)
  {}

  _CCCL_HIDE_FROM_ABI explicit async_buffer(const __env_t& __env, const size_type __size)
      : async_buffer(__env, __size, ::cuda::experimental::uninit)
  {
    this->__value_initialize_n(__unwrapped_begin(), __size);
  }

  _CCCL_HIDE_FROM_ABI explicit async_buffer(const __env_t& __env, const size_type __size, const _Tp& __value)
      : async_buffer(__env, __size, ::cuda::experimental::uninit)
  {
    this->__fill_n(__unwrapped_begin(), __size, __value);
  }

  _CCCL_HIDE_FROM_ABI explicit async_buffer(const __env_t& __env, const size_type __size, ::cuda::experimental::uninit_t)
      : __buf_(::cuda::experimental::get_memory_resource(__env), ::cuda::experimental::get_stream(__env), __size)
      , __size_(__size)
      , __policy_(__env.query(::cuda::experimental::execution::get_execution_policy))
  {}

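  //! @brief Constructs the buffer with a copy of the range \c [__first, __last), enqueuing the
  //! copy on the stream of \p __env.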
  _CCCL_TEMPLATE(class _Iter)
  _CCCL_REQUIRES(_CUDA_VSTD::__is_cpp17_forward_iterator<_Iter>::value)
  _CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env, _Iter __first, _Iter __last)
      : async_buffer(__env, static_cast<size_type>(_CUDA_VSTD::distance(__first, __last)), ::cuda::experimental::uninit)
  {
    this->__copy_cross<_Iter>(__first, __last, __unwrapped_begin(), __size_);
  }

  _CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env, _CUDA_VSTD::initializer_list<_Tp> __ilist)
      : async_buffer(__env, __ilist.size(), ::cuda::experimental::uninit)
  {
    this->__copy_cross(__ilist.begin(), __ilist.end(), __unwrapped_begin(), __size_);
  }

  _CCCL_TEMPLATE(class _Range)
  _CCCL_REQUIRES(__compatible_range<_Range> _CCCL_AND _CUDA_VRANGES::forward_range<_Range> _CCCL_AND
                   _CUDA_VRANGES::sized_range<_Range>)
  _CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env, _Range&& __range)
      : async_buffer(__env, static_cast<size_type>(_CUDA_VRANGES::size(__range)), ::cuda::experimental::uninit)
  {
    using _Iter = _CUDA_VRANGES::iterator_t<_Range>;
    this->__copy_cross<_Iter>(
      _CUDA_VRANGES::begin(__range), _CUDA_VRANGES::__unwrap_end(__range), __unwrapped_begin(), __size_);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // doxygen conflates the overloads
  _CCCL_TEMPLATE(class _Range)
  _CCCL_REQUIRES(__compatible_range<_Range> _CCCL_AND _CUDA_VRANGES::forward_range<_Range> _CCCL_AND(
    !_CUDA_VRANGES::sized_range<_Range>))
  _CCCL_HIDE_FROM_ABI async_buffer(const __env_t& __env, _Range&& __range)
      : async_buffer(
          __env,
          static_cast<size_type>(_CUDA_VRANGES::distance(_CUDA_VRANGES::begin(__range), _CUDA_VRANGES::end(__range))),
          ::cuda::experimental::uninit)
  {
    using _Iter = _CUDA_VRANGES::iterator_t<_Range>;
    this->__copy_cross<_Iter>(
      _CUDA_VRANGES::begin(__range), _CUDA_VRANGES::__unwrap_end(__range), __unwrapped_begin(), __size_);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  [[nodiscard]] _CCCL_HIDE_FROM_ABI iterator begin() noexcept
  {
    return iterator{__buf_.data()};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_iterator begin() const noexcept
  {
    return const_iterator{__buf_.data()};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_iterator cbegin() const noexcept
  {
    return const_iterator{__buf_.data()};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI iterator end() noexcept
  {
    return iterator{__buf_.data() + __size_};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_iterator end() const noexcept
  {
    return const_iterator{__buf_.data() + __size_};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_iterator cend() const noexcept
  {
    return const_iterator{__buf_.data() + __size_};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI reverse_iterator rbegin() noexcept
  {
    return reverse_iterator{end()};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept
  {
    return const_reverse_iterator{end()};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept
  {
    return const_reverse_iterator{end()};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI reverse_iterator rend() noexcept
  {
    return reverse_iterator{begin()};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept
  {
    return const_reverse_iterator{begin()};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept
  {
    return const_reverse_iterator{begin()};
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI pointer data() noexcept
  {
    return __buf_.data();
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_pointer data() const noexcept
  {
    return __buf_.data();
  }

#ifndef _CCCL_DOXYGEN_INVOKED
  [[nodiscard]] _CCCL_HIDE_FROM_ABI pointer __unwrapped_begin() noexcept
  {
    return __buf_.data();
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_pointer __unwrapped_begin() const noexcept
  {
    return __buf_.data();
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI pointer __unwrapped_end() noexcept
  {
    return __buf_.data() + __size_;
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_pointer __unwrapped_end() const noexcept
  {
    return __buf_.data() + __size_;
  }
#endif // _CCCL_DOXYGEN_INVOKED

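  //! @brief Returns a reference to the element at position \p __n. Synchronizes with the stored
  //! stream first; \c get_unsynchronized skips that synchronization and is only safe once all
  //! writes to the buffer have completed.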
  [[nodiscard]] _CCCL_HIDE_FROM_ABI reference get(const size_type __n) noexcept
  {
    _CCCL_ASSERT(__n < __size_, "cuda::experimental::async_vector::get out of range!");
    this->wait();
    return begin()[__n];
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_reference get(const size_type __n) const noexcept
  {
    _CCCL_ASSERT(__n < __size_, "cuda::experimental::async_vector::get out of range!");
    this->wait();
    return begin()[__n];
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI reference get_unsynchronized(const size_type __n) noexcept
  {
    _CCCL_ASSERT(__n < __size_, "cuda::experimental::async_vector::get_unsynchronized out of range!");
    return begin()[__n];
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const_reference get_unsynchronized(const size_type __n) const noexcept
  {
    _CCCL_ASSERT(__n < __size_, "cuda::experimental::async_vector::get_unsynchronized out of range!");
    return begin()[__n];
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI size_type size() const noexcept
  {
    return __size_;
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI bool empty() const noexcept
  {
    return __size_ == 0;
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI const __resource_t& get_memory_resource() const noexcept
  {
    return __buf_.get_memory_resource();
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr ::cuda::stream_ref get_stream() const noexcept
  {
    return __buf_.get_stream();
  }

  _CCCL_HIDE_FROM_ABI constexpr void change_stream(::cuda::stream_ref __new_stream)
  {
    __buf_.change_stream(__new_stream);
  }

  _CCCL_HIDE_FROM_ABI constexpr void change_stream_unsynchronized(::cuda::stream_ref __new_stream) noexcept
  {
    __buf_.change_stream_unsynchronized(__new_stream);
  }

  [[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr __policy_t get_execution_policy() const noexcept
  {
    return __policy_;
  }

  _CCCL_HIDE_FROM_ABI constexpr void set_execution_policy(__policy_t __new_policy) noexcept
  {
    __policy_ = __new_policy;
  }

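  //! @brief Blocks the calling thread until all work enqueued on the buffer's stream has completed.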
  _CCCL_HIDE_FROM_ABI void wait() const
  {
    __buf_.get_stream().wait();
  }

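  //! @brief Replaces the contents with \p __count copies of \p __value, growing the allocation
  //! when \p __count exceeds the current size. The fill is enqueued on the stored stream.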
  _CCCL_HIDE_FROM_ABI void assign(const size_type __count, const _Tp& __value)
  {
    if (__size_ < __count)
    {
      (void) __buf_.__replace_allocation(__count);
    }

    this->__fill_n(__unwrapped_begin(), __count, __value);
    __size_ = __count;
  }

  _CCCL_TEMPLATE(class _Iter)
  _CCCL_REQUIRES(_CUDA_VSTD::__is_cpp17_forward_iterator<_Iter>::value)
  _CCCL_HIDE_FROM_ABI void assign(_Iter __first, _Iter __last)
  {
    const auto __count = static_cast<size_type>(_CUDA_VSTD::distance(__first, __last));
    this->__assign_impl(__count, __first, __last);
  }

  _CCCL_HIDE_FROM_ABI void assign(_CUDA_VSTD::initializer_list<_Tp> __ilist)
  {
    const auto __count = static_cast<size_type>(__ilist.size());
    this->__assign_impl(__count, __ilist.begin(), __ilist.end());
  }

  _CCCL_TEMPLATE(class _Range)
  _CCCL_REQUIRES(__compatible_range<_Range> _CCCL_AND _CUDA_VRANGES::forward_range<_Range> _CCCL_AND
                   _CUDA_VRANGES::sized_range<_Range>)
  _CCCL_HIDE_FROM_ABI void assign_range(_Range&& __range)
  {
    const auto __count = _CUDA_VRANGES::size(__range);
    using _Iter        = _CUDA_VRANGES::iterator_t<_Range>;
    this->__assign_impl<_Iter>(__count, _CUDA_VRANGES::begin(__range), _CUDA_VRANGES::__unwrap_end(__range));
  }

#ifndef _CCCL_DOXYGEN_INVOKED // doxygen conflates the overloads
  _CCCL_TEMPLATE(class _Range)
  _CCCL_REQUIRES(__compatible_range<_Range> _CCCL_AND _CUDA_VRANGES::forward_range<_Range> _CCCL_AND(
    !_CUDA_VRANGES::sized_range<_Range>))
  _CCCL_HIDE_FROM_ABI void assign_range(_Range&& __range)
  {
    const auto __first = _CUDA_VRANGES::begin(__range);
    const auto __last  = _CUDA_VRANGES::__unwrap_end(__range);
    const auto __count = static_cast<size_type>(_CUDA_VRANGES::distance(__first, __last));

    using _Iter = _CUDA_VRANGES::iterator_t<_Range>;
    this->__assign_impl<_Iter>(__count, __first, __last);
  }
#endif // _CCCL_DOXYGEN_INVOKED

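  //! @brief Exchanges the allocation, size and execution policy with \p __other without
  //! synchronizing either stream.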
  _CCCL_HIDE_FROM_ABI void swap(async_buffer& __other) noexcept
  {
    _CUDA_VSTD::swap(__buf_, __other.__buf_);
    _CUDA_VSTD::swap(__size_, __other.__size_);
    _CUDA_VSTD::swap(__policy_, __other.__policy_);
  }

  _CCCL_HIDE_FROM_ABI friend void swap(async_buffer& __lhs, async_buffer& __rhs) noexcept
  {
    __lhs.swap(__rhs);
  }

  _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI bool operator==(const async_buffer& __lhs, const async_buffer& __rhs)
  {
    if constexpr (__is_host_only)
    {
      // We need to wait here because `host_launch` does not return values, so we cannot easily put
      // the comparison in stream order
      __lhs.wait();
      __rhs.wait();
      return _CUDA_VSTD::equal(
        __lhs.__unwrapped_begin(), __lhs.__unwrapped_end(), __rhs.__unwrapped_begin(), __rhs.__unwrapped_end());
    }
    else
    {
      ::cuda::experimental::__ensure_current_device __guard(__lhs.get_stream().get());
      return (__lhs.size() == __rhs.size())
          && thrust::equal(thrust::cuda::par_nosync.on(__lhs.get_stream()),
                           __lhs.__unwrapped_begin(),
                           __lhs.__unwrapped_end(),
                           __rhs.__unwrapped_begin());
    }
    _CCCL_UNREACHABLE();
  }
#if _CCCL_STD_VER <= 2017
  _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI bool operator!=(const async_buffer& __lhs, const async_buffer& __rhs)
  {
    return !(__lhs == __rhs);
  }
#endif // _CCCL_STD_VER <= 2017

#ifndef _CCCL_DOXYGEN_INVOKED // friend functions are currently broken
  _CCCL_TEMPLATE(class _Property)
  _CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
  _CCCL_HIDE_FROM_ABI friend void get_property(const async_buffer&, _Property) noexcept {}
#endif // _CCCL_DOXYGEN_INVOKED
};

template <class _Tp>
using async_device_buffer = async_buffer<_Tp, _CUDA_VMR::device_accessible>;

template <class _Tp>
using async_host_buffer = async_buffer<_Tp, _CUDA_VMR::host_accessible>;
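
//! A minimal usage sketch. The resource and stream below are assumptions for illustration: any
//! stream-ordered resource satisfying \c device_accessible and any \c cuda::experimental::stream
//! will do; the exact spellings are not mandated by this header.
//!
//! @code
//! // `mr` is a device-accessible async memory resource, `stream` a cuda::experimental::stream
//! cuda::experimental::env_t<cuda::mr::device_accessible> env{mr, stream};
//! cuda::experimental::async_device_buffer<int> buf{env, 1024, 42}; // fill enqueued on `stream`
//! buf.wait(); // block until the fill has completed
//! @endcode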

template <class _Tp, class... _TargetProperties, class... _SourceProperties>
async_buffer<_Tp, _TargetProperties...> make_async_buffer(
  const async_buffer<_Tp, _SourceProperties...>& __source,
  any_async_resource<_TargetProperties...> __mr,
  cuda::stream_ref __stream)
{
  env_t<_TargetProperties...> __env{__mr, __stream};
  async_buffer<_Tp, _TargetProperties...> __res{__env, __source.size(), uninit};
  __source.wait();

  _CCCL_TRY_CUDA_API(
    ::cudaMemcpyAsync,
    "cudax::async_buffer::__copy_cross: failed to copy data",
    __res.__unwrapped_begin(),
    __source.__unwrapped_begin(),
    sizeof(_Tp) * __source.size(),
    cudaMemcpyKind::cudaMemcpyDefault,
    __stream.get());

  return __res;
}

template <class _Tp, class... _TargetProperties, class... _SourceProperties>
async_buffer<_Tp, _TargetProperties...> make_async_buffer(
  const async_buffer<_Tp, _SourceProperties...>& __source, any_async_resource<_TargetProperties...> __mr)
{
  return ::cuda::experimental::make_async_buffer(__source, __mr, __source.get_stream());
}

template <class _Tp, class... _SourceProperties>
async_buffer<_Tp, _SourceProperties...> make_async_buffer(const async_buffer<_Tp, _SourceProperties...>& __source)
{
  return ::cuda::experimental::make_async_buffer(__source, __source.get_memory_resource(), __source.get_stream());
}
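
//! A minimal sketch of copying a buffer into a different memory space with `make_async_buffer`.
//! `host_buf` and `stream` are assumed to exist; the copy itself is enqueued on `stream`:
//!
//! @code
//! cuda::experimental::any_async_resource<cuda::mr::device_accessible> device_mr = /* ... */;
//! auto device_buf = cuda::experimental::make_async_buffer(host_buf, device_mr, stream);
//! device_buf.wait(); // synchronize before using the copied data
//! @endcode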

} // namespace cuda::experimental

_CCCL_POP_MACROS

#endif //__CUDAX__CONTAINER_ASYNC_BUFFER__