ceil_div
Ceiling division
template <typename T, typename U>
[[nodiscard]] __host__ __device__ inline
constexpr cuda::std::common_type_t<T, U> ceil_div(T value, U divisor) noexcept;
The function computes the ceiling division between two integral or enumerator values \(ceil(\frac{value}{base\_multiple})\).
Parameters
value
: The value to be divided.divisor
: The divisor.
Return value
Divides value
by divisor
. If value
is not a multiple of divisor
rounds the result up to the next integer value.
Preconditions
Compile-time:
T
andU
are integral types (including 128-bit integers) or enumerators.Run-time:
value >= 0
anddivisor > 0
.
Performance considerations
The function computes
(value + divisor - 1) / divisor
when the common type is a signed integer.The function computes
min(value, 1 + ((value - 1) / divisor)
when the common type is an unsigned integer in CUDA, which generates less instructions than(value / divisor) + ((value / divisor) * divisor != value)
, especially for 64-bit types.
Example
This API is very useful for determining the number of thread blocks required to process a fixed amount of work, given a fixed number of threads per block:
#include <cuda/cmath>
#include <cuda/std/span>
#include <thrust/device_vector.h>
__global__ void vector_scale_kernel(cuda::std::span<float> span, float scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < span.size())
span[index] *= scale;
}
int main() {
int num_items = 100'000;
float scale = 2.f;
thrust::device_vector<float> d_vector(num_items, 1.f);
// Given a fixed number of threads per block...
constexpr int threads_per_block = 256;
// ...dividing some "n" by "threads_per_block" may lead to a remainder,
// requiring the kernel to be launched with an extra thread block to handle it.
auto num_thread_blocks = cuda::ceil_div(num_items, threads_per_block);
auto d_ptr = thrust::raw_pointer_cast(d_vector.data());
cuda::std::span<float> d_span(d_ptr, num_items);
vector_scale_kernel<<<num_thread_blocks, threads_per_block>>>(d_span, scale);
cudaDeviceSynchronize();
return 0;
}