include/cuda/experimental/__stf/utility/occupancy.cuh
File members: include/cuda/experimental/__stf/utility/occupancy.cuh
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
#pragma once
#include <cuda/__cccl_config>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include <cuda/experimental/__stf/utility/cuda_safe_call.cuh>
#include <cuda/experimental/__stf/utility/hash.cuh>
namespace cuda::experimental::stf
{
namespace reserved
{
/**
 * @brief Result of an occupancy query (cuOccupancyMaxPotentialBlockSize /
 * cudaOccupancyMaxPotentialBlockSize) for a kernel.
 */
struct compute_occupancy_result
{
  int min_grid_size; // minimum grid size needed to achieve maximum occupancy
  int block_size; // block size that achieves maximum occupancy
};
/**
 * @brief Query (and cache) the occupancy-optimal launch configuration for a kernel.
 *
 * Dispatches to the CUDA driver API when `Kernel` is a `CUfunction`, and to the
 * runtime API otherwise.
 *
 * @param f               kernel handle (CUfunction) or kernel function pointer
 * @param dynamicSMemSize per-block dynamic shared memory, in bytes
 * @param blockSizeLimit  upper bound on the block size (0 = no limit)
 * @return minimum grid size and optimal block size for maximum occupancy
 */
template <typename Kernel>
compute_occupancy_result compute_occupancy(Kernel&& f, size_t dynamicSMemSize = 0, int blockSizeLimit = 0)
{
  // The cache below is static per template instantiation of `Kernel`, but two
  // distinct kernels may share the same type (any two CUfunction handles, or
  // any two kernels with identical signatures). The kernel handle therefore
  // must be part of the key, otherwise distinct kernels would collide and
  // return each other's cached results.
  using key_t =
    ::std::pair<const void* /*kernel*/, ::std::pair<size_t /*dynamicSMemSize*/, int /*blockSizeLimit*/>>;
  static ::std::unordered_map<key_t, compute_occupancy_result, ::cuda::experimental::stf::hash<key_t>> occupancy_cache;

  const auto key =
    ::std::make_pair(reinterpret_cast<const void*>(f), ::std::make_pair(dynamicSMemSize, blockSizeLimit));
  if (auto i = occupancy_cache.find(key); i != occupancy_cache.end())
  {
    // Cache hit
    return i->second;
  }

  // Miss: compute the configuration and store it in the cache.
  auto& result = occupancy_cache[key];
  if constexpr (::std::is_same_v<::std::decay_t<Kernel>, CUfunction>)
  {
    cuda_safe_call(cuOccupancyMaxPotentialBlockSize(
      &result.min_grid_size, &result.block_size, f, nullptr, dynamicSMemSize, blockSizeLimit));
  }
  else
  {
    cuda_safe_call(cudaOccupancyMaxPotentialBlockSize(
      &result.min_grid_size, &result.block_size, f, dynamicSMemSize, blockSizeLimit));
  }
  return result;
}
/**
 * @brief Launch limits computed for a kernel by `compute_kernel_limits`.
 */
struct cuda_kernel_limits_result
{
  int min_grid_size; // minimum grid size for maximum occupancy (capped at the SM count for cooperative launches)
  int max_block_size; // occupancy-optimal block size reported by the occupancy query
  int block_size_limit; // hard per-block thread limit from the kernel's function attributes
};
/**
 * @brief Compute launch limits for a kernel: the minimum grid size and optimal
 * block size for full occupancy, plus the hard per-block thread limit.
 *
 * For cooperative launches, the grid size is additionally capped at the SM
 * count of device 0 (assuming a homogeneous machine) since all blocks of a
 * cooperative kernel must be resident simultaneously.
 *
 * @param f                pointer to a `__global__` function
 * @param shared_mem_bytes per-block dynamic shared memory, in bytes
 * @param cooperative      whether the kernel will be launched cooperatively
 * @return grid/block limits for the kernel
 */
template <typename Fun>
cuda_kernel_limits_result compute_kernel_limits(const Fun& f, size_t shared_mem_bytes, bool cooperative)
{
  // NOTE: the parameter was previously `const Fun&&` — a const rvalue
  // reference (cv-qualified, hence not a forwarding reference) that rejected
  // lvalue arguments. `const Fun&` binds to both lvalues and rvalues.
  static_assert(::std::is_function_v<::std::remove_pointer_t<Fun>>,
                "Template parameter Fun must be a pointer to a function type.");

  cuda_kernel_limits_result res;

  auto occupancy_res = compute_occupancy(f, shared_mem_bytes);
  res.min_grid_size  = occupancy_res.min_grid_size;

  if (cooperative)
  {
    // For cooperative kernels, the number of blocks is limited. We compute the number of SM on device 0 and assume
    // we have a homogeneous machine.
    static const int sm_count = cuda_try<cudaDeviceGetAttribute>(cudaDevAttrMultiProcessorCount, 0);
    // TODO there could be more than 1 block per SM, but we do not know the actual block sizes for now ...
    res.min_grid_size = ::std::min(res.min_grid_size, sm_count);
  }

  res.max_block_size = occupancy_res.block_size;

  /* Compute the maximum block size (not the optimal size) */
  cudaFuncAttributes attrs;
  cuda_safe_call(cudaFuncGetAttributes(&attrs, f));
  res.block_size_limit = attrs.maxThreadsPerBlock;

  return res;
}
} // end namespace reserved
} // namespace cuda::experimental::stf