cub/util_device.cuh

File members: cub/util_device.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/util_debug.cuh>
#include <cub/util_policy_wrapper_t.cuh>
#include <cub/util_type.cuh>
// for backward compatibility
#include <cub/util_temporary_storage.cuh>

#include <cuda/std/__cuda/ensure_current_device.h> // IWYU pragma: export
#include <cuda/std/array>
#include <cuda/std/atomic>
#include <cuda/std/cassert>
#include <cuda/std/type_traits>
#include <cuda/std/utility>

#if !_CCCL_COMPILER(NVRTC)
#  if defined(CUB_DEFINE_RUNTIME_POLICIES)
#    include <format>
#    include <string_view>

#    include <nlohmann/json.hpp>
#  endif // defined(CUB_DEFINE_RUNTIME_POLICIES)
#endif // !_CCCL_COMPILER(NVRTC)

#if defined(CUB_ENABLE_POLICY_PTX_JSON)
#  include <cub/detail/ptx-json/json.h>
#endif // defined(CUB_ENABLE_POLICY_PTX_JSON)

#include <nv/target>

CUB_NAMESPACE_BEGIN

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document

namespace detail
{
template <typename T>
CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel()
{}

} // namespace detail

#endif // _CCCL_DOXYGEN_INVOKED

#if !_CCCL_COMPILER(NVRTC)

CUB_RUNTIME_FUNCTION inline int CurrentDevice()
{
  int device = -1;
  if (CubDebug(cudaGetDevice(&device)))
  {
    return -1;
  }
  return device;
}

#  ifndef _CCCL_DOXYGEN_INVOKED // Do not document

using SwitchDevice = ::cuda::__ensure_current_device;

#  endif // _CCCL_DOXYGEN_INVOKED

CUB_RUNTIME_FUNCTION inline int DeviceCountUncached()
{
  int count = -1;
  if (CubDebug(cudaGetDeviceCount(&count)))
  {
    // CUDA makes no guarantees about the state of the output parameter if
    // `cudaGetDeviceCount` fails; in practice, they don't, but out of
    // paranoia we'll reset `count` to `-1`.
    count = -1;
  }
  return count;
}

// Host code. This is a separate function to avoid defining a local static in a host/device function.
_CCCL_HOST inline int DeviceCountCachedValue()
{
  static int count = DeviceCountUncached();
  return count;
}

CUB_RUNTIME_FUNCTION inline int DeviceCount()
{
  int result = -1;

  NV_IF_TARGET(NV_IS_HOST, (result = DeviceCountCachedValue();), (result = DeviceCountUncached();));

  return result;
}

#  ifndef _CCCL_DOXYGEN_INVOKED // Do not document

struct PerDeviceAttributeCache
{
  struct DevicePayload
  {
    int attribute;
    cudaError_t error;
  };

  // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the
  // `DeviceEntryInitializing` state, and then proceeds to the
  // `DeviceEntryReady` state. These are the only state transitions allowed;
  // i.e. a linear sequence of transitions.
  enum DeviceEntryStatus
  {
    DeviceEntryEmpty = 0,
    DeviceEntryInitializing,
    DeviceEntryReady
  };

  struct DeviceEntry
  {
    ::cuda::std::atomic<DeviceEntryStatus> flag;
    DevicePayload payload;
  };

private:
  ::cuda::std::array<DeviceEntry, detail::max_devices> entries_;

public:
  _CCCL_HOST inline PerDeviceAttributeCache()
      : entries_()
  {
    assert(DeviceCount() <= detail::max_devices);
  }

  template <typename Invocable>
  _CCCL_HOST DevicePayload operator()(Invocable&& f, int device)
  {
    if (device >= DeviceCount() || device < 0)
    {
      return DevicePayload{0, cudaErrorInvalidDevice};
    }

    auto& entry   = entries_[device];
    auto& flag    = entry.flag;
    auto& payload = entry.payload;

    DeviceEntryStatus old_status = DeviceEntryEmpty;

    // First, check for the common case of the entry being ready.
    if (flag.load(::cuda::std::memory_order_acquire) != DeviceEntryReady)
    {
      // Assume the entry is empty and attempt to lock it so we can fill
      // it by trying to set the state from `DeviceEntryReady` to
      // `DeviceEntryInitializing`.
      if (flag.compare_exchange_strong(
            old_status, DeviceEntryInitializing, ::cuda::std::memory_order_acq_rel, ::cuda::std::memory_order_acquire))
      {
        // We successfully set the state to `DeviceEntryInitializing`;
        // we have the lock and it's our job to initialize this entry
        // and then release it.

        // We don't use `CubDebug` here because we let the user code
        // decide whether or not errors are hard errors.
        payload.error = ::cuda::std::forward<Invocable>(f)(payload.attribute);
        if (payload.error)
        {
          // Clear the global CUDA error state which may have been
          // set by the last call. Otherwise, errors may "leak" to
          // unrelated kernel launches.
          cudaGetLastError();
        }

        // Release the lock by setting the state to `DeviceEntryReady`.
        flag.store(DeviceEntryReady, ::cuda::std::memory_order_release);
      }

      // If the `compare_exchange_weak` failed, then `old_status` has
      // been updated with the value of `flag` that it observed.

      else if (old_status == DeviceEntryInitializing)
      {
        // Another execution agent is initializing this entry; we need
        // to wait for them to finish; we'll know they're done when we
        // observe the entry status as `DeviceEntryReady`.
        do
        {
          old_status = flag.load(::cuda::std::memory_order_acquire);
        } while (old_status != DeviceEntryReady);
        // FIXME: Use `atomic::wait` instead when we have access to
        // host-side C++20 atomics. We could use libcu++, but it only
        // supports atomics for SM60 and up, even if you're only using
        // them in host code.
      }
    }

    // We now know that the state of our entry is `DeviceEntryReady`, so
    // just return the entry's payload.
    return entry.payload;
  }
};
#  endif // _CCCL_DOXYGEN_INVOKED

CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version)
{
  // Instantiate `EmptyKernel<void>` in both host and device code to ensure
  // it can be called.
  using EmptyKernelPtr                         = void (*)();
  [[maybe_unused]] EmptyKernelPtr empty_kernel = detail::EmptyKernel<void>;

  // Define a temporary macro that expands to the current target ptx version
  // in device code.
  // <nv/target> may provide an abstraction for this eventually. For now,
  // we have to keep this usage of __CUDA_ARCH__.
#  if _CCCL_CUDA_COMPILER(NVHPC)
#    define CUB_TEMP_GET_PTX __builtin_current_device_sm()
#  else // ^^^ _CCCL_CUDA_COMPILER(NVHPC) ^^^ / vvv !_CCCL_CUDA_COMPILER(NVHPC) vvv
#    define CUB_TEMP_GET_PTX _CCCL_PTX_ARCH()
#  endif // ^^^ !_CCCL_CUDA_COMPILER(NVHPC) ^^^

  cudaError_t result = cudaSuccess;
  NV_IF_TARGET(
    NV_IS_HOST,
    (cudaFuncAttributes empty_kernel_attrs;

     result = CubDebug(cudaFuncGetAttributes(&empty_kernel_attrs, reinterpret_cast<void*>(empty_kernel)));

     ptx_version = empty_kernel_attrs.ptxVersion * 10;),
    // NV_IS_DEVICE
    (
      // This is necessary to ensure instantiation of EmptyKernel in device
      // code. The `reinterpret_cast` is necessary to suppress a
      // set-but-unused warnings. This is a meme now:
      // https://twitter.com/blelbach/status/1222391615576100864
      (void) reinterpret_cast<EmptyKernelPtr>(empty_kernel);

      ptx_version = CUB_TEMP_GET_PTX;));

#  undef CUB_TEMP_GET_PTX

  return result;
}

_CCCL_HOST inline cudaError_t PtxVersionUncached(int& ptx_version, int device)
{
  [[maybe_unused]] SwitchDevice sd(device);
  return PtxVersionUncached(ptx_version);
}

template <typename Tag>
_CCCL_HOST inline PerDeviceAttributeCache& GetPerDeviceAttributeCache()
{
  static PerDeviceAttributeCache cache;
  return cache;
}

struct PtxVersionCacheTag
{};
struct SmVersionCacheTag
{};

_CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device)
{
  // Note: the ChainedPolicy pruning (i.e., invoke_static) requites that there's an exact match between one of the
  // architectures in __CUDA_ARCH__ and the runtime queried ptx version.
  auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
    // If this call fails, then we get the error code back in the payload, which we check with `CubDebug` below.
    [=](int& pv) {
      return PtxVersionUncached(pv, device);
    },
    device);

  if (!CubDebug(payload.error))
  {
    ptx_version = payload.attribute;
  }

  return payload.error;
}

CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version)
{
  // Note: the ChainedPolicy pruning (i.e., invoke_static) requites that there's an exact match between one of the
  // architectures in __CUDA_ARCH__ and the runtime queried ptx version.
  cudaError_t result = cudaErrorUnknown;
  NV_IF_TARGET(NV_IS_HOST,
               (result = PtxVersion(ptx_version, CurrentDevice());),
               ( // NV_IS_DEVICE:
                 result = PtxVersionUncached(ptx_version);));
  return result;
}

CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
{
  cudaError_t error = cudaSuccess;
  do
  {
    int major = 0, minor = 0;
    error = CubDebug(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
    if (cudaSuccess != error)
    {
      break;
    }

    error = CubDebug(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
    if (cudaSuccess != error)
    {
      break;
    }
    sm_version = major * 100 + minor * 10;
  } while (0);

  return error;
}

CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice())
{
  cudaError_t result = cudaErrorUnknown;

  NV_IF_TARGET(
    NV_IS_HOST,
    (auto const payload = GetPerDeviceAttributeCache<SmVersionCacheTag>()(
       // If this call fails, then we get the error code back in the payload, which we check with `CubDebug` below.
       [=](int& pv) {
         return SmVersionUncached(pv, device);
       },
       device);

     if (!CubDebug(payload.error)) { sm_version = payload.attribute; };

     result = payload.error;),
    ( // NV_IS_DEVICE
      result = SmVersionUncached(sm_version, device);));

  return result;
}

CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream([[maybe_unused]] cudaStream_t stream)
{
  NV_IF_TARGET(NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), (return cudaErrorNotSupported;))
}

namespace detail
{
CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream([[maybe_unused]] cudaStream_t stream)
{
#  ifdef CUB_DEBUG_SYNC
  NV_IF_TARGET(NV_IS_HOST,
               (_CubLog("%s", "Synchronizing...\n"); return SyncStream(stream);),
               (_CubLog("%s", "WARNING: Skipping CUB debug synchronization in device code"); return cudaSuccess;));
#  else // ^^^ CUB_DEBUG_SYNC / !CUB_DEBUG_SYNC vvv
  return cudaSuccess;
#  endif // ^^^ !CUB_DEBUG_SYNC ^^^
}

CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
{
  has_uva           = false;
  int device        = -1;
  cudaError_t error = CubDebug(cudaGetDevice(&device));
  if (cudaSuccess != error)
  {
    return error;
  }

  int uva = 0;
  error   = CubDebug(cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device));
  if (cudaSuccess != error)
  {
    return error;
  }
  has_uva = uva == 1;
  return error;
}

} // namespace detail

template <typename KernelPtr>
_CCCL_VISIBILITY_HIDDEN CUB_RUNTIME_FUNCTION inline cudaError_t
MaxSmOccupancy(int& max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes = 0)
{
  return CubDebug(
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_sm_occupancy, kernel_ptr, block_threads, dynamic_smem_bytes));
}

#endif // !_CCCL_COMPILER(NVRTC)

/******************************************************************************
 * Policy management
 ******************************************************************************/
// PolicyWrapper

namespace detail
{

#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
#  if !_CCCL_HAS_CONCEPTS()
#    error Generation of runtime policy wrappers and/or policy PTX JSON information requires C++20 concepts.
#  endif // !_CCCL_HAS_CONCEPTS()
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)

#define CUB_DETAIL_POLICY_WRAPPER_CONCEPT_TEST(field)     , StaticPolicyT::_CCCL_PP_FIRST field
#define CUB_DETAIL_POLICY_WRAPPER_REFINE_CONCEPT(concept) concept<StaticPolicyT>&&

#define CUB_DETAIL_POLICY_WRAPPER_ACCESSOR(field)                   \
  __host__ __device__ static constexpr auto _CCCL_PP_SECOND field() \
  {                                                                 \
    return StaticPolicyT::_CCCL_PP_FIRST field;                     \
  }

#if defined(CUB_ENABLE_POLICY_PTX_JSON)
#  define CUB_DETAIL_POLICY_WRAPPER_ENCODED_FIELD(field) \
    key<_CCCL_TO_STRING(_CCCL_PP_FIRST field)>() = value<(int) StaticPolicyT::_CCCL_PP_FIRST field>(),

#  define CUB_DETAIL_POLICY_WRAPPER_ENCODED_POLICY(...)                                     \
    _CCCL_DEVICE static constexpr auto EncodedPolicy()                                      \
    {                                                                                       \
      using namespace ptx_json;                                                             \
      return object<_CCCL_PP_FOR_EACH(CUB_DETAIL_POLICY_WRAPPER_ENCODED_FIELD, __VA_ARGS__) \
                      key<"__dummy">() = value<0>()>();                                     \
    }
#else
#  define CUB_DETAIL_POLICY_WRAPPER_ENCODED_POLICY(...)
#endif // defined(CUB_ENABLE_POLICY_PTX_JSON)

#if defined(CUB_DEFINE_RUNTIME_POLICIES)
#  define CUB_DETAIL_POLICY_WRAPPER_FIELD(field)                       \
    _CCCL_PP_THIRD field _CCCL_PP_CAT(runtime_, _CCCL_PP_FIRST field); \
    _CCCL_PP_THIRD field _CCCL_PP_SECOND field() const                 \
    {                                                                  \
      return _CCCL_PP_CAT(runtime_, _CCCL_PP_FIRST field);             \
    }

#  define CUB_DETAIL_POLICY_WRAPPER_GET_FIELD(field)  \
    ap._CCCL_PP_CAT(runtime_, _CCCL_PP_FIRST field) = \
      static_cast<_CCCL_PP_THIRD field>(subpolicy[_CCCL_TO_STRING(_CCCL_PP_FIRST field)].get<int>());

#  define CUB_DETAIL_POLICY_WRAPPER_FIELD_STRING(field) \
    _CCCL_TO_STRING(static constexpr auto _CCCL_PP_FIRST field = static_cast<_CCCL_PP_THIRD field>({});) "\n"

#  define CUB_DETAIL_POLICY_WRAPPER_FIELD_VALUE(field) , (int) ap._CCCL_PP_CAT(runtime_, _CCCL_PP_FIRST field)

#  define CUB_DETAIL_POLICY_WRAPPER_AGENT_POLICY(concept_name, ...)                                                    \
    struct Runtime##concept_name                                                                                       \
    {                                                                                                                  \
      _CCCL_PP_FOR_EACH(CUB_DETAIL_POLICY_WRAPPER_FIELD, __VA_ARGS__)                                                  \
      static std::pair<Runtime##concept_name, std::string>                                                             \
      from_json(const nlohmann::json& json, std::string_view subpolicy_name)                                           \
      {                                                                                                                \
        auto subpolicy = json[subpolicy_name];                                                                         \
        assert(!subpolicy.is_null());                                                                                  \
        Runtime##concept_name ap;                                                                                      \
        _CCCL_PP_FOR_EACH(CUB_DETAIL_POLICY_WRAPPER_GET_FIELD, __VA_ARGS__)                                            \
        return std::make_pair(                                                                                         \
          ap,                                                                                                          \
          std::format("struct {} {{\n" _CCCL_PP_FOR_EACH(CUB_DETAIL_POLICY_WRAPPER_FIELD_STRING, __VA_ARGS__) "}};\n", \
                      subpolicy_name _CCCL_PP_FOR_EACH(CUB_DETAIL_POLICY_WRAPPER_FIELD_VALUE, __VA_ARGS__)));          \
      }                                                                                                                \
    };
#else
#  define CUB_DETAIL_POLICY_WRAPPER_AGENT_POLICY(...)
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES)

template <typename T>
_CCCL_CONCEPT always_true = true;

#define CUB_DETAIL_POLICY_WRAPPER_DEFINE(concept_name, refines, ...)                                                   \
  template <typename StaticPolicyT>                                                                                    \
  _CCCL_CONCEPT concept_name = _CCCL_PP_FOR_EACH(CUB_DETAIL_POLICY_WRAPPER_REFINE_CONCEPT, _CCCL_PP_EXPAND refines)    \
    _CCCL_REQUIRES_EXPR((StaticPolicyT))(true _CCCL_PP_FOR_EACH(CUB_DETAIL_POLICY_WRAPPER_CONCEPT_TEST, __VA_ARGS__)); \
  template <typename StaticPolicyT>                                                                                    \
  struct concept_name##Wrapper : StaticPolicyT                                                                         \
  {                                                                                                                    \
    __host__ __device__ constexpr concept_name##Wrapper(StaticPolicyT base)                                            \
        : StaticPolicyT(base)                                                                                          \
    {}                                                                                                                 \
    _CCCL_PP_FOR_EACH(CUB_DETAIL_POLICY_WRAPPER_ACCESSOR, __VA_ARGS__)                                                 \
    CUB_DETAIL_POLICY_WRAPPER_ENCODED_POLICY(__VA_ARGS__)                                                              \
  };                                                                                                                   \
  _CCCL_TEMPLATE(typename StaticPolicyT)                                                                               \
  _CCCL_REQUIRES(concept_name<StaticPolicyT>)                                                                          \
  __host__ __device__ constexpr concept_name##Wrapper<StaticPolicyT> MakePolicyWrapper(StaticPolicyT policy)           \
  {                                                                                                                    \
    return concept_name##Wrapper{policy};                                                                              \
  }                                                                                                                    \
  CUB_DETAIL_POLICY_WRAPPER_AGENT_POLICY(concept_name, __VA_ARGS__)

// Generic agent policy
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
  GenericAgentPolicy, (always_true), (BLOCK_THREADS, BlockThreads, int), (ITEMS_PER_THREAD, ItemsPerThread, int) )

_CCCL_TEMPLATE(typename PolicyT)
_CCCL_REQUIRES((!GenericAgentPolicy<PolicyT>) )
__host__ __device__ constexpr PolicyT MakePolicyWrapper(PolicyT policy)
{
  return policy;
}

} // namespace detail

//----------------------------------------------------------------------------------------------------------------------
// ChainedPolicy

#if !_CCCL_COMPILER(NVRTC)

namespace detail
{

// Forward declaration of the default kernel launcher factory
struct TripleChevronFactory;

// By default, CUB uses `cub::detail::TripleChevronFactory` to access the CUDA runtime.
// The `CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY` indirection is used to override the default kernel launcher factory
// in CUB tests. This allows us to:
//   1. retrieve kernel pointers on the usage side of the API, and
//   2. validate use of specified CUDA stream by accelerated algorithms.
#  ifndef CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY
#    define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY cub::detail::TripleChevronFactory
#  endif

struct KernelConfig
{
  int block_threads{0};
  int items_per_thread{0};
  int tile_size{0};
  int sm_occupancy{0};

  template <typename AgentPolicyT,
            typename KernelPtrT,
            typename LauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
  CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
  Init(KernelPtrT kernel_ptr, AgentPolicyT agent_policy = {}, LauncherFactory launcher_factory = {})
  {
    block_threads    = cub::detail::MakePolicyWrapper(agent_policy).BlockThreads();
    items_per_thread = cub::detail::MakePolicyWrapper(agent_policy).ItemsPerThread();
    tile_size        = block_threads * items_per_thread;
    return launcher_factory.MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
  }
};

} // namespace detail
#endif // !_CCCL_COMPILER(NVRTC)

template <int PolicyPtxVersion, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
  using ActivePolicy = ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion), typename PrevPolicyT::ActivePolicy, PolicyT>;

#if !_CCCL_COMPILER(NVRTC)
  template <typename FunctorT>
  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int device_ptx_version, FunctorT& op)
  {
    // __CUDA_ARCH_LIST__ is only available from CTK 11.5 onwards
#  ifdef __CUDA_ARCH_LIST__
    return runtime_to_compiletime<1, __CUDA_ARCH_LIST__>(device_ptx_version, op);
    // NV_TARGET_SM_INTEGER_LIST is defined by NVHPC. The values need to be multiplied by 10 to match
    // __CUDA_ARCH_LIST__. E.g. arch 860 from __CUDA_ARCH_LIST__ corresponds to arch 86 from NV_TARGET_SM_INTEGER_LIST.
#  elif defined(NV_TARGET_SM_INTEGER_LIST)
    return runtime_to_compiletime<10, NV_TARGET_SM_INTEGER_LIST>(device_ptx_version, op);
#  else
    if (device_ptx_version < PolicyPtxVersion)
    {
      return PrevPolicyT::Invoke(device_ptx_version, op);
    }
    return op.template Invoke<PolicyT>();
#  endif
  }
#endif // !_CCCL_COMPILER(NVRTC)

private:
  template <int, typename, typename>
  friend struct ChainedPolicy; // let us call invoke_static of other ChainedPolicy instantiations

#if !_CCCL_COMPILER(NVRTC)
  template <int ArchMult, int... CudaArches, typename FunctorT>
  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t runtime_to_compiletime(int device_ptx_version, FunctorT& op)
  {
    // We instantiate invoke_static for each CudaArches, but only call the one matching device_ptx_version.
    // If there's no exact match of the architectures in __CUDA_ARCH_LIST__/NV_TARGET_SM_INTEGER_LIST and the runtime
    // queried ptx version (i.e., the closest ptx version to the current device's architecture that the EmptyKernel was
    // compiled for), we return cudaErrorInvalidDeviceFunction. Such a scenario may arise if CUB_DISABLE_NAMESPACE_MAGIC
    // is set and different TUs are compiled for different sets of architecture.
    cudaError_t e = cudaErrorInvalidDeviceFunction;
    (..., (device_ptx_version == CudaArches * ArchMult ? (e = invoke_static<CudaArches * ArchMult>(op)) : cudaSuccess));
    return e;
  }

  template <int DevicePtxVersion, typename FunctorT>
  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
  {
    if constexpr (DevicePtxVersion < PolicyPtxVersion)
    {
      return PrevPolicyT::template invoke_static<DevicePtxVersion>(op);
    }
    else
    {
      return op.template Invoke<PolicyT>();
    }
  }
#endif // !_CCCL_COMPILER(NVRTC)
};

template <int PolicyPtxVersion, typename PolicyT>
struct ChainedPolicy<PolicyPtxVersion, PolicyT, PolicyT>
{
  template <int, typename, typename>
  friend struct ChainedPolicy; // befriend primary template, so it can call invoke_static

  using ActivePolicy = PolicyT;

#if !_CCCL_COMPILER(NVRTC)
  template <typename FunctorT>
  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op)
  {
    return op.template Invoke<PolicyT>();
  }

private:
  template <int, typename FunctorT>
  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
  {
    return op.template Invoke<PolicyT>();
  }
#endif // !_CCCL_COMPILER(NVRTC)
};

CUB_NAMESPACE_END

#if _CCCL_HAS_CUDA_COMPILER() && !_CCCL_COMPILER(NVRTC)
#  include <cub/detail/launcher/cuda_runtime.cuh> // to complete the definition of TripleChevronFactory
#endif // _CCCL_HAS_CUDA_COMPILER() && !_CCCL_COMPILER(NVRTC)