cub/util_device.cuh
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <cub/config.cuh>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include <cub/detail/device_synchronize.cuh> // IWYU pragma: export
#include <cub/util_debug.cuh>
#include <cub/util_type.cuh>
// for backward compatibility
#include <cub/util_temporary_storage.cuh>
#include <cuda/std/__cuda/ensure_current_device.h> // IWYU pragma: export
#include <cuda/std/type_traits>
#include <cuda/std/utility>
#include <array>
#include <atomic>
#include <cassert>
#include <nv/target>
CUB_NAMESPACE_BEGIN
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
namespace detail
{
// TODO(bgruber): this should be called something like "override_policy"
template <typename PolicyT, int BLOCK_THREADS_, int ITEMS_PER_THREAD_ = PolicyT::ITEMS_PER_THREAD>
struct policy_wrapper_t : PolicyT
{
static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_;
static constexpr int BLOCK_THREADS = BLOCK_THREADS_;
static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
};
} // namespace detail
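// A minimal usage sketch of detail::policy_wrapper_t (the policy name below is hypothetical, not
// part of this header): it inherits all members of the wrapped policy while overriding the block
// size and, optionally, the items per thread.
//
//   struct MyAgentPolicy
//   {
//     static constexpr int BLOCK_THREADS    = 256;
//     static constexpr int ITEMS_PER_THREAD = 4;
//   };
//   // Same tuning, but run with 128 threads and 8 items per thread:
//   using Overridden = cub::detail::policy_wrapper_t<MyAgentPolicy, 128, 8>;
//   static_assert(Overridden::ITEMS_PER_TILE == 128 * 8, "");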
template <typename T>
CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel()
{}
#endif // _CCCL_DOXYGEN_INVOKED
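// Returns the current CUDA device ordinal, or -1 if an error occurred.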
CUB_RUNTIME_FUNCTION inline int CurrentDevice()
{
int device = -1;
if (CubDebug(cudaGetDevice(&device)))
{
return -1;
}
return device;
}
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
using SwitchDevice = ::cuda::__ensure_current_device;
#endif // _CCCL_DOXYGEN_INVOKED
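// Returns the number of CUDA devices available, or -1 if an error occurred. The result is not cached.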
CUB_RUNTIME_FUNCTION inline int DeviceCountUncached()
{
int count = -1;
if (CubDebug(cudaGetDeviceCount(&count)))
{
// CUDA makes no guarantees about the state of the output parameter if
// `cudaGetDeviceCount` fails; in practice it is left untouched, but out of
// paranoia we reset `count` to `-1`.
count = -1;
}
return count;
}
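// Cache for an arbitrary value produced by a nullary function call.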
template <typename T, T (*Function)()>
struct CUB_DEPRECATED ValueCache
{
T const value;
_CCCL_HOST inline ValueCache()
: value(Function())
{}
};
// Host code. This is a separate function to avoid defining a local static in a host/device function.
_CCCL_HOST inline int DeviceCountCachedValue()
{
static int count = DeviceCountUncached();
return count;
}
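// Returns the number of CUDA devices available, or -1 if an error occurred.
// The result is cached when called from host code.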
CUB_RUNTIME_FUNCTION inline int DeviceCount()
{
int result = -1;
NV_IF_TARGET(NV_IS_HOST, (result = DeviceCountCachedValue();), (result = DeviceCountUncached();));
return result;
}
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
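// Per-device cache for a CUDA attribute value. Each entry is filled lazily and in a thread-safe
// manner the first time it is requested.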
struct PerDeviceAttributeCache
{
struct DevicePayload
{
int attribute;
cudaError_t error;
};
// Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the
// `DeviceEntryInitializing` state, and then proceeds to the
// `DeviceEntryReady` state. These are the only state transitions allowed;
// i.e. a linear sequence of transitions.
enum DeviceEntryStatus
{
DeviceEntryEmpty = 0,
DeviceEntryInitializing,
DeviceEntryReady
};
struct DeviceEntry
{
std::atomic<DeviceEntryStatus> flag;
DevicePayload payload;
};
private:
std::array<DeviceEntry, CUB_MAX_DEVICES> entries_;
public:
_CCCL_HOST inline PerDeviceAttributeCache()
: entries_()
{
assert(DeviceCount() <= CUB_MAX_DEVICES);
}
template <typename Invocable>
_CCCL_HOST DevicePayload operator()(Invocable&& f, int device)
{
if (device >= DeviceCount() || device < 0)
{
return DevicePayload{0, cudaErrorInvalidDevice};
}
auto& entry = entries_[device];
auto& flag = entry.flag;
auto& payload = entry.payload;
DeviceEntryStatus old_status = DeviceEntryEmpty;
// First, check for the common case of the entry being ready.
if (flag.load(std::memory_order_acquire) != DeviceEntryReady)
{
// Assume the entry is empty and attempt to lock it so we can fill
// it by trying to set the state from `DeviceEntryEmpty` to
// `DeviceEntryInitializing`.
if (flag.compare_exchange_strong(
old_status, DeviceEntryInitializing, std::memory_order_acq_rel, std::memory_order_acquire))
{
// We successfully set the state to `DeviceEntryInitializing`;
// we have the lock and it's our job to initialize this entry
// and then release it.
// We don't use `CubDebug` here because we let the user code
// decide whether or not errors are hard errors.
payload.error = ::cuda::std::forward<Invocable>(f)(payload.attribute);
if (payload.error)
{
// Clear the global CUDA error state which may have been
// set by the last call. Otherwise, errors may "leak" to
// unrelated kernel launches.
cudaGetLastError();
}
// Release the lock by setting the state to `DeviceEntryReady`.
flag.store(DeviceEntryReady, std::memory_order_release);
}
// If the `compare_exchange_strong` failed, then `old_status` has
// been updated with the value of `flag` that it observed.
else if (old_status == DeviceEntryInitializing)
{
// Another execution agent is initializing this entry; we need
// to wait for them to finish; we'll know they're done when we
// observe the entry status as `DeviceEntryReady`.
do
{
old_status = flag.load(std::memory_order_acquire);
} while (old_status != DeviceEntryReady);
// FIXME: Use `atomic::wait` instead when we have access to
// host-side C++20 atomics. We could use libcu++, but it only
// supports atomics for SM60 and up, even if you're only using
// them in host code.
}
}
// We now know that the state of our entry is `DeviceEntryReady`, so
// just return the entry's payload.
return entry.payload;
}
};
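// A minimal usage sketch (the tag and attribute below are hypothetical, and `device` is assumed to
// be a valid device ordinal; this mirrors how PtxVersion and SmVersion further down use this
// cache): the invocable runs at most once per device, and its error code travels back in the
// returned payload.
//
//   struct SmCountCacheTag
//   {};
//   auto payload = GetPerDeviceAttributeCache<SmCountCacheTag>()(
//     [device](int& attr) {
//       return cudaDeviceGetAttribute(&attr, cudaDevAttrMultiProcessorCount, device);
//     },
//     device);
//   if (payload.error == cudaSuccess)
//   {
//     int sm_count = payload.attribute;
//   }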
#endif // _CCCL_DOXYGEN_INVOKED
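// Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
// The result is not cached.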
CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version)
{
// Instantiate `EmptyKernel<void>` in both host and device code to ensure
// it can be called.
using EmptyKernelPtr = void (*)();
EmptyKernelPtr empty_kernel = EmptyKernel<void>;
// This is necessary to suppress unused variable warnings in host compilers. The
// usual syntax of (void)empty_kernel; was not sufficient on MSVC2015.
(void) reinterpret_cast<void*>(empty_kernel);
// Define a temporary macro that expands to the current target ptx version
// in device code.
// <nv/target> may provide an abstraction for this eventually. For now,
// we have to keep this usage of __CUDA_ARCH__.
#if defined(_NVHPC_CUDA)
# define CUB_TEMP_GET_PTX __builtin_current_device_sm()
#else
# define CUB_TEMP_GET_PTX __CUDA_ARCH__
#endif
cudaError_t result = cudaSuccess;
NV_IF_TARGET(
NV_IS_HOST,
(cudaFuncAttributes empty_kernel_attrs;
result = CubDebug(cudaFuncGetAttributes(&empty_kernel_attrs, reinterpret_cast<void*>(empty_kernel)));
ptx_version = empty_kernel_attrs.ptxVersion * 10;),
// NV_IS_DEVICE
(
// This is necessary to ensure instantiation of EmptyKernel in device
// code. The `reinterpret_cast` is necessary to suppress a
// set-but-unused warning. This is a meme now:
// https://twitter.com/blelbach/status/1222391615576100864
(void) reinterpret_cast<EmptyKernelPtr>(empty_kernel);
ptx_version = CUB_TEMP_GET_PTX;));
#undef CUB_TEMP_GET_PTX
return result;
}
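// Retrieves the PTX version that will be used on `device` (major * 100 + minor * 10). The result
// is not cached.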
_CCCL_HOST inline cudaError_t PtxVersionUncached(int& ptx_version, int device)
{
SwitchDevice sd(device);
(void) sd;
return PtxVersionUncached(ptx_version);
}
template <typename Tag>
_CCCL_HOST inline PerDeviceAttributeCache& GetPerDeviceAttributeCache()
{
static PerDeviceAttributeCache cache;
return cache;
}
struct PtxVersionCacheTag
{};
struct SmVersionCacheTag
{};
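// Retrieves the PTX version that will be used on `device` (major * 100 + minor * 10). The result
// is cached per device.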
_CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device)
{
// Note: the ChainedPolicy pruning (i.e., invoke_static) requires that there's an exact match between one of the
// architectures in __CUDA_ARCH_LIST__ and the runtime-queried ptx version.
auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
// If this call fails, then we get the error code back in the payload, which we check with `CubDebug` below.
[=](int& pv) {
return PtxVersionUncached(pv, device);
},
device);
if (!CubDebug(payload.error))
{
ptx_version = payload.attribute;
}
return payload.error;
}
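// Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
// The result is cached when called from host code.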
CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version)
{
// Note: the ChainedPolicy pruning (i.e., invoke_static) requires that there's an exact match between one of the
// architectures in __CUDA_ARCH_LIST__ and the runtime-queried ptx version.
cudaError_t result = cudaErrorUnknown;
NV_IF_TARGET(NV_IS_HOST,
(result = PtxVersion(ptx_version, CurrentDevice());),
( // NV_IS_DEVICE:
result = PtxVersionUncached(ptx_version);));
return result;
}
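// Retrieves the SM version of `device` (major * 100 + minor * 10). The result is not cached.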
CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
{
cudaError_t error = cudaSuccess;
do
{
int major = 0, minor = 0;
error = CubDebug(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
if (cudaSuccess != error)
{
break;
}
error = CubDebug(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
if (cudaSuccess != error)
{
break;
}
sm_version = major * 100 + minor * 10;
} while (0);
return error;
}
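// Retrieves the SM version of `device` (major * 100 + minor * 10). The result is cached per device
// when called from host code.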
CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice())
{
cudaError_t result = cudaErrorUnknown;
NV_IF_TARGET(
NV_IS_HOST,
(auto const payload = GetPerDeviceAttributeCache<SmVersionCacheTag>()(
// If this call fails, then we get the error code back in the payload, which we check with `CubDebug` below.
[=](int& pv) {
return SmVersionUncached(pv, device);
},
device);
if (!CubDebug(payload.error)) { sm_version = payload.attribute; };
result = payload.error;),
( // NV_IS_DEVICE
result = SmVersionUncached(sm_version, device);));
return result;
}
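// A minimal usage sketch: query the PTX and SM versions for the current device. Both values are
// encoded as major * 100 + minor * 10 (e.g. 800 for sm_80).
//
//   int ptx_version = 0;
//   int sm_version  = 0;
//   cudaError_t error = cub::PtxVersion(ptx_version);
//   if (error == cudaSuccess)
//   {
//     error = cub::SmVersion(sm_version);
//   }
//
// Synchronizes `stream` when called from host code; in device code, falls back to a device-wide
// synchronization instead.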
CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream)
{
cudaError_t result = cudaErrorNotSupported;
NV_IF_TARGET(NV_IS_HOST,
(result = CubDebug(cudaStreamSynchronize(stream));),
((void) stream; result = CubDebug(cub::detail::device_synchronize());));
return result;
}
namespace detail
{
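// Same as SyncStream, but only synchronizes when CUB_DETAIL_DEBUG_ENABLE_SYNC is defined. Skips
// the synchronization (with a warning) where device-side synchronization is unavailable.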
CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream)
{
#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC
(void) stream;
return cudaSuccess;
#else // CUB_DETAIL_DEBUG_ENABLE_SYNC:
# define CUB_TMP_SYNC_AVAILABLE \
_CubLog("%s\n", "Synchronizing..."); \
return SyncStream(stream)
# define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \
(void) stream; \
_CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \
"device-side sync requires <sm_90, RDC, and CDPv1"); \
return cudaSuccess
# ifdef CUB_DETAIL_CDPv1
// Can sync everywhere but SM_90+
NV_IF_TARGET(NV_PROVIDES_SM_90, (CUB_TMP_DEVICE_SYNC_UNAVAILABLE;), (CUB_TMP_SYNC_AVAILABLE;));
# else // CDPv2 or no CDP:
// Can only sync on host
NV_IF_TARGET(NV_IS_HOST, (CUB_TMP_SYNC_AVAILABLE;), (CUB_TMP_DEVICE_SYNC_UNAVAILABLE;));
# endif // CDP version
# undef CUB_TMP_DEVICE_SYNC_UNAVAILABLE
# undef CUB_TMP_SYNC_AVAILABLE
#endif // CUB_DETAIL_DEBUG_ENABLE_SYNC
}
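// Sets `has_uva` to true if the current device supports unified addressing.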
CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
{
has_uva = false;
int device = -1;
cudaError_t error = CubDebug(cudaGetDevice(&device));
if (cudaSuccess != error)
{
return error;
}
int uva = 0;
error = CubDebug(cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device));
if (cudaSuccess != error)
{
return error;
}
has_uva = uva == 1;
return error;
}
} // namespace detail
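// Computes the maximum SM occupancy in thread blocks for the given kernel function pointer,
// block size, and dynamic shared memory usage.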
template <typename KernelPtr>
_CCCL_VISIBILITY_HIDDEN CUB_RUNTIME_FUNCTION inline cudaError_t
MaxSmOccupancy(int& max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes = 0)
{
return CubDebug(
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_sm_occupancy, kernel_ptr, block_threads, dynamic_smem_bytes));
}
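// A minimal usage sketch (ExampleKernel is a hypothetical __global__ function, not part of this
// header): query how many thread blocks of 256 threads can be resident per SM for that kernel.
//
//   __global__ void ExampleKernel() {}
//   ...
//   int max_sm_occupancy = 0;
//   cudaError_t error = cub::MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 256);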
/******************************************************************************
* Policy management
******************************************************************************/
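// Wraps a tuning policy. The specialization below adds BlockThreads()/ItemsPerThread() accessors
// when the policy defines the corresponding static members.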
template <typename PolicyT, typename = void>
struct PolicyWrapper : PolicyT
{
CUB_RUNTIME_FUNCTION PolicyWrapper(PolicyT base)
: PolicyT(base)
{}
};
template <typename StaticPolicyT>
struct PolicyWrapper<
StaticPolicyT,
_CUDA_VSTD::void_t<decltype(StaticPolicyT::BLOCK_THREADS), decltype(StaticPolicyT::ITEMS_PER_THREAD)>> : StaticPolicyT
{
CUB_RUNTIME_FUNCTION PolicyWrapper(StaticPolicyT base)
: StaticPolicyT(base)
{}
CUB_RUNTIME_FUNCTION static constexpr int BlockThreads()
{
return StaticPolicyT::BLOCK_THREADS;
}
CUB_RUNTIME_FUNCTION static constexpr int ItemsPerThread()
{
return StaticPolicyT::ITEMS_PER_THREAD;
}
};
template <typename PolicyT>
CUB_RUNTIME_FUNCTION PolicyWrapper<PolicyT> MakePolicyWrapper(PolicyT policy)
{
return PolicyWrapper<PolicyT>{policy};
}
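// A minimal usage sketch (the policy name below is hypothetical): MakePolicyWrapper provides a
// uniform interface for querying block size and items per thread from a static tuning policy.
//
//   struct MyAgentPolicy
//   {
//     static constexpr int BLOCK_THREADS    = 128;
//     static constexpr int ITEMS_PER_THREAD = 4;
//   };
//   auto wrapped   = cub::MakePolicyWrapper(MyAgentPolicy{});
//   int  tile_size = wrapped.BlockThreads() * wrapped.ItemsPerThread(); // 512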
struct TripleChevronFactory;
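// Kernel launch configuration: block size, items per thread, tile size, and SM occupancy.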
struct KernelConfig
{
int block_threads;
int items_per_thread;
int tile_size;
int sm_occupancy;
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE KernelConfig()
: block_threads(0)
, items_per_thread(0)
, tile_size(0)
, sm_occupancy(0)
{}
template <typename AgentPolicyT, typename KernelPtrT, typename LauncherFactory = TripleChevronFactory>
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
Init(KernelPtrT kernel_ptr, AgentPolicyT agent_policy = {}, LauncherFactory launcher_factory = {})
{
block_threads = MakePolicyWrapper(agent_policy).BlockThreads();
items_per_thread = MakePolicyWrapper(agent_policy).ItemsPerThread();
tile_size = block_threads * items_per_thread;
return launcher_factory.MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
}
};
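// Helper for dispatching into a chain of tuning policies. ActivePolicy selects, at compile time,
// the newest policy in the chain whose PTX version does not exceed the target architecture, while
// Invoke performs the same selection at runtime based on the queried PTX version.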
template <int PolicyPtxVersion, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
using ActivePolicy = ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion), typename PrevPolicyT::ActivePolicy, PolicyT>;
template <typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int device_ptx_version, FunctorT& op)
{
// __CUDA_ARCH_LIST__ is only available from CTK 11.5 onwards
#ifdef __CUDA_ARCH_LIST__
return runtime_to_compiletime<__CUDA_ARCH_LIST__>(device_ptx_version, op);
#else
if (device_ptx_version < PolicyPtxVersion)
{
return PrevPolicyT::Invoke(device_ptx_version, op);
}
return op.template Invoke<PolicyT>();
#endif
}
private:
template <int, typename, typename>
friend struct ChainedPolicy; // let us call invoke_static of other ChainedPolicy instantiations
template <int... CudaArches, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t runtime_to_compiletime(int device_ptx_version, FunctorT& op)
{
// We instantiate invoke_static for each architecture in CudaArches, but only call the one matching
// device_ptx_version. If none of the architectures in __CUDA_ARCH_LIST__ exactly matches the
// runtime-queried ptx version (i.e., the closest ptx version to the current device's architecture
// that EmptyKernel was compiled for), we return cudaErrorInvalidDeviceFunction. Such a scenario may
// arise if CUB_DISABLE_NAMESPACE_MAGIC is set and different TUs are compiled for different sets of
// architectures.
cudaError_t e = cudaErrorInvalidDeviceFunction;
const cudaError_t dummy[] = {
(device_ptx_version == CudaArches ? (e = invoke_static<CudaArches>(op, ::cuda::std::true_type{}))
: cudaSuccess)...};
(void) dummy;
return e;
}
template <int DevicePtxVersion, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op, ::cuda::std::true_type)
{
// TODO(bgruber): drop diagnostic suppression in C++17
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_MSVC(4127) // suppress Conditional Expression is Constant
_CCCL_IF_CONSTEXPR (DevicePtxVersion < PolicyPtxVersion)
{
// TODO(bgruber): drop boolean tag dispatches in C++17, since _CCCL_IF_CONSTEXPR will discard this branch properly
return PrevPolicyT::template invoke_static<DevicePtxVersion>(
op, ::cuda::std::bool_constant<(DevicePtxVersion < PolicyPtxVersion)>{});
}
else
{
return do_invoke(op, ::cuda::std::bool_constant<(DevicePtxVersion >= PolicyPtxVersion)>{});
}
_CCCL_DIAG_POP
}
template <int, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT&, ::cuda::std::false_type)
{
_CCCL_UNREACHABLE();
}
template <typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t do_invoke(FunctorT& op, ::cuda::std::true_type)
{
return op.template Invoke<PolicyT>();
}
template <typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t do_invoke(FunctorT&, ::cuda::std::false_type)
{
_CCCL_UNREACHABLE();
}
};
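// End of the policy chain: the oldest policy terminates the chain by naming itself as its own
// predecessor, so both compile-time and runtime selection bottom out here.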
template <int PolicyPtxVersion, typename PolicyT>
struct ChainedPolicy<PolicyPtxVersion, PolicyT, PolicyT>
{
template <int, typename, typename>
friend struct ChainedPolicy; // befriend primary template, so it can call invoke_static
using ActivePolicy = PolicyT;
template <typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op)
{
return op.template Invoke<PolicyT>();
}
private:
template <int, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op, ::cuda::std::true_type)
{
return op.template Invoke<PolicyT>();
}
template <int, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT&, ::cuda::std::false_type)
{
_CCCL_UNREACHABLE();
}
};
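// A minimal usage sketch (policy names and the dispatch functor are hypothetical): policies are
// chained from oldest to newest architecture, the oldest one naming itself as its predecessor, and
// Invoke dispatches to the newest policy whose PTX version does not exceed the device's.
//
//   struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { /* tunings for sm_50+ */ };
//   struct Policy800 : cub::ChainedPolicy<800, Policy800, Policy500> { /* tunings for sm_80+ */ };
//   using MaxPolicy = Policy800;
//
//   struct DispatchOp
//   {
//     template <typename ActivePolicyT>
//     CUB_RUNTIME_FUNCTION cudaError_t Invoke()
//     {
//       // launch kernels tuned by ActivePolicyT
//       return cudaSuccess;
//     }
//   };
//
//   int ptx_version = 0;
//   cudaError_t error = cub::PtxVersion(ptx_version);
//   DispatchOp op;
//   if (error == cudaSuccess)
//   {
//     error = MaxPolicy::Invoke(ptx_version, op);
//   }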
CUB_NAMESPACE_END
#include <cub/launcher/cuda_runtime.cuh> // to complete the definition of TripleChevronFactory