include/cuda/experimental/__launch/configuration.cuh
File members: include/cuda/experimental/__launch/configuration.cuh
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
#ifndef _CUDAX__LAUNCH_CONFIGURATION
#define _CUDAX__LAUNCH_CONFIGURATION
#include <cuda/std/span>
#include <cuda/std/tuple>
#include <cuda/experimental/__detail/utility.cuh>
#include <cuda/experimental/hierarchy.cuh>
#if _CCCL_STD_VER >= 2017
namespace cuda::experimental
{
template <typename Dimensions, typename... Options>
struct kernel_config;
namespace detail
{
struct launch_option
{
static constexpr bool needs_attribute_space = false;
static constexpr bool is_relevant_on_device = false;
protected:
_CCCL_NODISCARD cudaError_t apply(cudaLaunchConfig_t&, void*) const noexcept
{
return cudaSuccess;
}
};
template <typename Dimensions, typename... Options>
cudaError_t apply_kernel_config(
const kernel_config<Dimensions, Options...>& config, cudaLaunchConfig_t& cuda_config, void* kernel) noexcept;
// Might need to go to the main namespace?
enum class launch_option_kind
{
cooperative_launch,
dynamic_shared_memory,
launch_priority
};
struct option_not_found
{};
template <detail::launch_option_kind Kind>
struct find_option_in_tuple_impl
{
template <typename Option, typename... Options>
_CCCL_DEVICE auto& operator()(const Option& opt, const Options&... rest)
{
if constexpr (Option::kind == Kind)
{
return opt;
}
else
{
return (*this)(rest...);
}
}
_CCCL_DEVICE auto operator()()
{
return option_not_found();
}
};
template <detail::launch_option_kind Kind, typename... Options>
_CCCL_DEVICE auto& find_option_in_tuple(const ::cuda::std::tuple<Options...>& tuple)
{
return ::cuda::std::apply(find_option_in_tuple_impl<Kind>(), tuple);
}
template <typename _Option, typename... _OptionsList>
inline constexpr bool __option_present_in_list = ((_Option::kind == _OptionsList::kind) || ...);
template <typename...>
inline constexpr bool no_duplicate_options = true;
template <typename Option, typename... Rest>
inline constexpr bool no_duplicate_options<Option, Rest...> =
!__option_present_in_list<Option, Rest...> && no_duplicate_options<Rest...>;
} // namespace detail
struct cooperative_launch : public detail::launch_option
{
static constexpr bool needs_attribute_space = true;
static constexpr bool is_relevant_on_device = true;
static constexpr detail::launch_option_kind kind = detail::launch_option_kind::cooperative_launch;
constexpr cooperative_launch() = default;
template <typename Dimensions, typename... Options>
friend cudaError_t detail::apply_kernel_config(
const kernel_config<Dimensions, Options...>& config, cudaLaunchConfig_t& cuda_config, void* kernel) noexcept;
private:
_CCCL_NODISCARD cudaError_t apply(cudaLaunchConfig_t& config, void*) const noexcept
{
cudaLaunchAttribute attr;
attr.id = cudaLaunchAttributeCooperative;
attr.val.cooperative = true;
config.attrs[config.numAttrs++] = attr;
return cudaSuccess;
}
};
template <typename Content, std::size_t Extent = 1, bool NonPortableSize = false>
struct dynamic_shared_memory_option : public detail::launch_option
{
using content_type = Content;
static constexpr std::size_t extent = Extent;
static constexpr bool is_relevant_on_device = true;
static constexpr detail::launch_option_kind kind = detail::launch_option_kind::dynamic_shared_memory;
const std::size_t size;
constexpr dynamic_shared_memory_option(std::size_t set_size) noexcept
: size(set_size)
{}
template <typename Dimensions, typename... Options>
friend cudaError_t detail::apply_kernel_config(
const kernel_config<Dimensions, Options...>& config, cudaLaunchConfig_t& cuda_config, void* kernel) noexcept;
private:
_CCCL_NODISCARD cudaError_t apply(cudaLaunchConfig_t& config, void* kernel) const noexcept
{
cudaFuncAttributes attrs;
int size_needed = static_cast<int>(size * sizeof(Content));
cudaError_t status = cudaFuncGetAttributes(&attrs, kernel);
if ((size_needed > attrs.maxDynamicSharedSizeBytes) && NonPortableSize)
{
// TODO since 12.6 there is a per launch option available, we should switch once compatibility is not an issue
// TODO should we validate the max amount with device props or just pass it through and rely on driver error?
status = cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, size_needed);
if (status != cudaSuccess)
{
return status;
}
}
config.dynamicSmemBytes = size_needed;
return cudaSuccess;
}
};
template <typename Content, std::size_t Extent = 1, bool NonPortableSize = false>
constexpr dynamic_shared_memory_option<Content, Extent, NonPortableSize> dynamic_shared_memory() noexcept
{
static_assert(Extent != ::cuda::std::dynamic_extent, "Size needs to be provided when dynamic_extent is specified");
return dynamic_shared_memory_option<Content, Extent, NonPortableSize>(Extent);
}
template <typename Content, bool NonPortableSize = false>
constexpr dynamic_shared_memory_option<Content, ::cuda::std::dynamic_extent, NonPortableSize>
dynamic_shared_memory(std::size_t count) noexcept
{
return dynamic_shared_memory_option<Content, ::cuda::std::dynamic_extent, NonPortableSize>(count);
}
struct launch_priority : public detail::launch_option
{
static constexpr bool needs_attribute_space = true;
static constexpr bool is_relevant_on_dpevice = false;
static constexpr detail::launch_option_kind kind = detail::launch_option_kind::launch_priority;
unsigned int priority;
launch_priority(unsigned int p) noexcept
: priority(p)
{}
template <typename Dimensions, typename... Options>
friend cudaError_t detail::apply_kernel_config(
const kernel_config<Dimensions, Options...>& config, cudaLaunchConfig_t& cuda_config, void* kernel) noexcept;
private:
_CCCL_NODISCARD cudaError_t apply(cudaLaunchConfig_t& config, void*) const noexcept
{
cudaLaunchAttribute attr;
attr.id = cudaLaunchAttributePriority;
attr.val.priority = priority;
config.attrs[config.numAttrs++] = attr;
return cudaSuccess;
}
};
template <typename... _OptionsToFilter>
struct __filter_options
{
template <bool _Pred, typename _Option>
_CCCL_NODISCARD auto __option_or_empty(const _Option& __option)
{
if constexpr (_Pred)
{
return ::cuda::std::tuple(__option);
}
else
{
return ::cuda::std::tuple();
}
}
template <typename... _Options>
_CCCL_NODISCARD auto operator()(const _Options&... __options)
{
return ::cuda::std::tuple_cat(
__option_or_empty<!detail::__option_present_in_list<_Options, _OptionsToFilter...>>(__options)...);
}
};
template <typename _Dimensions, typename... _Options>
auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts);
template <typename _T>
inline constexpr bool __is_kernel_config = false;
template <typename _Dimensions, typename... _Options>
inline constexpr bool __is_kernel_config<kernel_config<_Dimensions, _Options...>> = true;
template <typename _Tp>
_CCCL_CONCEPT __kernel_has_default_config =
_CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)(requires(__is_kernel_config<decltype(__t.default_config())>));
template <typename Dimensions, typename... Options>
struct kernel_config
{
Dimensions dims;
::cuda::std::tuple<Options...> options;
static_assert(::cuda::std::_And<::cuda::std::is_base_of<detail::launch_option, Options>...>::value);
static_assert(detail::no_duplicate_options<Options...>);
constexpr kernel_config(const Dimensions& dims, const Options&... opts)
: dims(dims)
, options(opts...) {};
constexpr kernel_config(const Dimensions& dims, const ::cuda::std::tuple<Options...>& opts)
: dims(dims)
, options(opts) {};
template <typename... NewOptions>
_CCCL_NODISCARD auto add(const NewOptions&... new_options) const
{
return kernel_config<Dimensions, Options..., NewOptions...>(
dims, ::cuda::std::tuple_cat(options, ::cuda::std::make_tuple(new_options...)));
}
template <typename _OtherDimensions, typename... _OtherOptions>
_CCCL_NODISCARD auto combine(const kernel_config<_OtherDimensions, _OtherOptions...>& __other_config) const
{
// can't use fully qualified kernel_config name here because of nvcc bug, TODO remove __make_config_from_tuple once
// fixed
return __make_config_from_tuple(
dims.combine(__other_config.dims),
::cuda::std::tuple_cat(options, ::cuda::std::apply(__filter_options<Options...>{}, __other_config.options)));
}
template <typename _Kernel>
_CCCL_NODISCARD auto combine_with_default(const _Kernel& __kernel) const
{
if constexpr (__kernel_has_default_config<_Kernel>)
{
return combine(__kernel.default_config());
}
else
{
return *this;
}
}
};
// We can consider removing the operator&, but its convenient for in-line construction
template <typename Dimensions, typename... Options, typename NewLevel>
_CUDAX_HOST_API constexpr auto
operator&(const kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
{
return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
}
template <typename NewLevel, typename Dimensions, typename... Options>
_CUDAX_HOST_API constexpr auto
operator&(const NewLevel& new_level, const kernel_config<Dimensions, Options...>& config) noexcept
{
return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
}
template <typename L1, typename Dims1, typename L2, typename Dims2>
_CUDAX_HOST_API constexpr auto
operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims2>& l2) noexcept
{
return kernel_config(make_hierarchy_fragment(l1, l2));
}
template <typename _Dimensions, typename... _Options>
auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts)
{
return kernel_config(__dims, __opts);
}
template <typename Dimensions,
typename... Options,
typename Option,
typename = ::cuda::std::enable_if_t<::cuda::std::is_base_of_v<detail::launch_option, Option>>>
_CCCL_NODISCARD constexpr auto
operator&(const kernel_config<Dimensions, Options...>& config, const Option& option) noexcept
{
return config.add(option);
}
template <typename... Levels,
typename Option,
typename = ::cuda::std::enable_if_t<::cuda::std::is_base_of_v<detail::launch_option, Option>>>
_CCCL_NODISCARD constexpr auto operator&(const hierarchy_dimensions<Levels...>& dims, const Option& option) noexcept
{
return kernel_config(dims, option);
}
template <typename BottomUnit, typename... Levels, typename... Opts>
_CCCL_NODISCARD constexpr auto
make_config(const hierarchy_dimensions_fragment<BottomUnit, Levels...>& dims, const Opts&... opts) noexcept
{
return kernel_config<hierarchy_dimensions_fragment<BottomUnit, Levels...>, Opts...>(dims, opts...);
}
template <int _ThreadsPerBlock>
constexpr auto distribute(int numElements) noexcept
{
int blocksPerGrid = (numElements + _ThreadsPerBlock - 1) / _ThreadsPerBlock;
return make_config(make_hierarchy(grid_dims(blocksPerGrid), block_dims<_ThreadsPerBlock>()));
}
template <typename... Prev>
_CCCL_NODISCARD constexpr auto __process_config_args(const ::cuda::std::tuple<Prev...>& previous)
{
if constexpr (sizeof...(Prev) == 0)
{
return kernel_config<__empty_hierarchy>(__empty_hierarchy());
}
else
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous));
}
}
template <typename... Prev, typename Arg, typename... Rest>
_CCCL_NODISCARD constexpr auto
__process_config_args(const ::cuda::std::tuple<Prev...>& previous, const Arg& arg, const Rest&... rest)
{
if constexpr (::cuda::std::is_base_of_v<detail::launch_option, Arg>)
{
static_assert((::cuda::std::is_base_of_v<detail::launch_option, Rest> && ...),
"Hierarchy levels and launch options can't be mixed");
if constexpr (sizeof...(Prev) == 0)
{
return kernel_config(__empty_hierarchy(), arg, rest...);
}
else
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous), arg, rest...);
}
}
else
{
return __process_config_args(::cuda::std::tuple_cat(previous, ::cuda::std::make_tuple(arg)), rest...);
}
}
template <typename... Args>
_CCCL_NODISCARD constexpr auto make_config(const Args&... args)
{
return __process_config_args(::cuda::std::make_tuple(), args...);
}
namespace detail
{
template <typename Dimensions, typename... Options>
inline unsigned int constexpr kernel_config_count_attr_space(const kernel_config<Dimensions, Options...>&) noexcept
{
return (0 + ... + Options::needs_attribute_space);
}
template <typename Dimensions, typename... Options>
_CCCL_NODISCARD cudaError_t apply_kernel_config(
const kernel_config<Dimensions, Options...>& config, cudaLaunchConfig_t& cuda_config, void* kernel) noexcept
{
cudaError_t status = cudaSuccess;
::cuda::std::apply(
[&](auto&... config_options) {
// Use short-cutting && to skip the rest on error, is this too convoluted?
(void) (... && [&](cudaError_t call_status) {
status = call_status;
return call_status == cudaSuccess;
}(config_options.apply(cuda_config, kernel)));
},
config.options);
return status;
}
// Needs to be a char casted to the appropriate type, if it would be a template
// different instantiations would clash the extern symbol
_CCCL_DEVICE _CCCL_NODISCARD static char* get_smem_ptr() noexcept
{
extern __shared__ char dynamic_smem[];
return &dynamic_smem[0];
}
} // namespace detail
// Might consider cutting this one due to being a potential trap with missing & in auto& var = dynamic_smem_ref(...);
template <typename Dimensions, typename... Options>
_CCCL_DEVICE auto& dynamic_smem_ref(const kernel_config<Dimensions, Options...>& config) noexcept
{
auto& option = detail::find_option_in_tuple<detail::launch_option_kind::dynamic_shared_memory>(config.options);
using option_type = ::cuda::std::remove_reference_t<decltype(option)>;
static_assert(!::cuda::std::is_same_v<option_type, detail::option_not_found>,
"Dynamic shared memory option not found in the kernel configuration");
static_assert(option_type::extent == 1, "Usable only on dynamic shared memory with a single element");
return *reinterpret_cast<typename option_type::content_type*>(detail::get_smem_ptr());
}
template <typename Dimensions, typename... Options>
_CCCL_DEVICE auto dynamic_smem_span(const kernel_config<Dimensions, Options...>& config) noexcept
{
auto& option = detail::find_option_in_tuple<detail::launch_option_kind::dynamic_shared_memory>(config.options);
using option_type = ::cuda::std::remove_reference_t<decltype(option)>;
static_assert(!::cuda::std::is_same_v<option_type, detail::option_not_found>,
"Dynamic shared memory option not found in the kernel configuration");
return cuda::std::span<typename option_type::content_type, option_type::extent>(
reinterpret_cast<typename option_type::content_type*>(detail::get_smem_ptr()), option.size);
}
} // namespace cuda::experimental
#endif // _CCCL_STD_VER >= 2017
#endif // _CUDAX__LAUNCH_CONFIGURATION