Math
// Ceiling division for integral types: divides a by b and rounds the result up
// to the next integer when a is not a multiple of b.
// T must be an integral type; callers must pass a >= 0 and b > 0.
template <typename T>
[[nodiscard]] __host__ __device__ constexpr T ceil_div(T a, T b) noexcept;
ceil_div
_Requires_: is_integral_v<T> is true.
_Preconditions_: a >= 0 is true and b > 0 is true.
_Returns_: The quotient of a divided by b. If a is not a multiple of b, the result is rounded up to the next integer value.
Note
The function is only `constexpr` from C++14 onwards.
Example: This API is very useful for determining the number of thread blocks required to process a fixed amount of work, given a fixed number of threads per block:
#include <vector>
#include <cuda/cmath>
// Multiplies each of the first n elements of x by s in place.
// Each thread derives one flat index from its block and thread coordinates
// and handles at most that single element.
__global__ void vscale(int n, float s, float *x) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads whose index lands at or beyond n have nothing to do.
  if (idx >= n) {
    return;
  }
  x[idx] = x[idx] * s;
}
// Scales a vector of n floats by s on the GPU, using cuda::ceil_div to size
// the grid. Returns 0 on success and 1 if any CUDA call fails.
int main() {
  const int n = 100000;
  const float s = 2.f;
  std::vector<float> x(n, 1.f);
  const std::size_t bytes = x.size() * sizeof(float);

  // std::vector owns ordinary host memory, which a kernel must not dereference.
  // Stage the data through a device allocation instead of passing x.data().
  float *d_x = nullptr;
  if (cudaMalloc(&d_x, bytes) != cudaSuccess) {
    return 1;
  }

  cudaError_t status = cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
  if (status == cudaSuccess) {
    // Given a fixed number of threads per block...
    constexpr int threads_per_block = 256;
    // ...dividing some "n" by "threads_per_block" may lead to a remainder,
    // requiring the kernel to be launched with an extra thread block to handle it.
    const int thread_blocks = cuda::ceil_div(n, threads_per_block);
    vscale<<<thread_blocks, threads_per_block>>>(n, s, d_x);
    // A kernel launch returns nothing; configuration errors surface here.
    status = cudaGetLastError();
  }
  if (status == cudaSuccess) {
    // Errors raised while the kernel executes are reported by the sync.
    status = cudaDeviceSynchronize();
  }
  if (status == cudaSuccess) {
    // Bring the scaled values back into the host vector.
    status = cudaMemcpy(x.data(), d_x, bytes, cudaMemcpyDeviceToHost);
  }

  // Release the device buffer on every path, success or failure.
  cudaFree(d_x);
  return status == cudaSuccess ? 0 : 1;
}
See it on Godbolt TODO