// cub/thread/thread_reduce.cuh
// File members: cub/thread/thread_reduce.cuh
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <cub/config.cuh>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include <cub/detail/array_utils.cuh> // to_array()
#include <cub/detail/type_traits.cuh> // are_same()
#include <cub/thread/thread_load.cuh> // UnrolledCopy
#include <cub/thread/thread_operators.cuh>
#include <cub/thread/thread_simd.cuh>
#include <cub/util_namespace.cuh>
#include <cuda/functional> // cuda::maximum
#include <cuda/std/array> // array
#include <cuda/std/cassert> // assert
#include <cuda/std/cstdint> // uint16_t
#include <cuda/std/functional> // cuda::std::plus
#include <cuda/std/iterator> // cuda::std::iter_value_t
#if _CCCL_HAS_NVFP16()
# include <cuda_fp16.h>
#endif // _CCCL_HAS_NVFP16()
#if _CCCL_HAS_NVBF16()
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP
#endif // _CCCL_HAS_NVBF16()
CUB_NAMESPACE_BEGIN
// Forward declaration of the public per-thread reduction dispatch, so the internal
// implementations below (e.g. ThreadReduceSimd, which recurses via cub::ThreadReduce)
// can refer to it. Full documentation is on the definition further down.
// - Input:       fixed-size random-access range (raw array, cuda::std::array, ...)
// - ReductionOp: binary functor combining two values
// - ValueT:      element type of Input
// - AccumT:      accumulator type deduced from ReductionOp and ValueT
template <typename Input,
typename ReductionOp,
typename ValueT = _CUDA_VSTD::iter_value_t<Input>,
typename AccumT = _CUDA_VSTD::__accumulator_t<ReductionOp, ValueT>>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op);
// forward declaration
/***********************************************************************************************************************
* Internal Reduction Implementations
**********************************************************************************************************************/
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
namespace detail
{
/***********************************************************************************************************************
* Enable SIMD/Tree reduction heuristics (Trait)
**********************************************************************************************************************/
// TODO: add Blackwell support
//----------------------------------------------------------------------------------------------------------------------
// SM90 SIMD
// SM90+: SIMD reduction pays off only for 16-bit integer min/max and long-enough inputs.
// T is the *element* type of the input range; Length is its compile-time size.
template <typename T, typename ReductionOp, int Length>
inline constexpr bool enable_sm90_simd_reduction_v =
cub::detail::is_one_of_v<T, int16_t, uint16_t> && is_cuda_std_min_max_v<ReductionOp, T> && Length >= 10;
//----------------------------------------------------------------------------------------------------------------------
// SM80 SIMD
// Primary template: disabled for all types; enabled only via the __half/__nv_bfloat16
// specializations below (min/max or plus/multiplies, at least 4 elements).
template <typename T, typename ReductionOp, int Length>
inline constexpr bool enable_sm80_simd_reduction_v = false;
# if _CCCL_HAS_NVFP16()
template <typename ReductionOp, int Length>
inline constexpr bool enable_sm80_simd_reduction_v<__half, ReductionOp, Length> =
(is_cuda_std_min_max_v<ReductionOp, __half> || is_cuda_std_plus_mul_v<ReductionOp, __half>) && Length >= 4;
# endif // defined(_CCCL_HAS_NVFP16)
# if _CCCL_HAS_NVBF16()
template <typename ReductionOp, int Length>
inline constexpr bool enable_sm80_simd_reduction_v<__nv_bfloat16, ReductionOp, Length> =
(is_cuda_std_min_max_v<ReductionOp, __nv_bfloat16> || is_cuda_std_plus_mul_v<ReductionOp, __nv_bfloat16>)
&& Length >= 4;
# endif // _CCCL_HAS_NVBF16()
//----------------------------------------------------------------------------------------------------------------------
// SM70 SIMD
// Only __half plus/multiplies qualifies on SM70 (no bf16 hardware support there);
// when fp16 support is compiled out, the trait is unconditionally false.
# if _CCCL_HAS_NVFP16()
template <typename T, typename ReductionOp, int Length>
inline constexpr bool enable_sm70_simd_reduction_v =
_CUDA_VSTD::is_same_v<T, __half> && is_cuda_std_plus_mul_v<ReductionOp, T> && Length >= 4;
# else // _CCCL_HAS_NVFP16() ^^^^ / !_CCCL_HAS_NVFP16() vvvv
template <typename T, typename ReductionOp, int Length>
inline constexpr bool enable_sm70_simd_reduction_v = false;
# endif // !_CCCL_HAS_NVFP16() ^^^^
/***********************************************************************************************************************
* Enable Ternary Reduction (Trait)
**********************************************************************************************************************/
// SM90+: ternary-tree reduction is profitable for 32-bit integer min/max
// (and, below, for packed half2/bfloat162 min/max, including the SIMD functors).
// T here is the accumulation type the tree operates on.
template <typename T, typename ReductionOp>
inline constexpr bool enable_ternary_reduction_sm90_v =
cub::detail::is_one_of_v<T, int32_t, uint32_t> && is_cuda_std_min_max_v<ReductionOp, T>;
# if _CCCL_HAS_NVFP16()
template <typename ReductionOp>
inline constexpr bool enable_ternary_reduction_sm90_v<__half2, ReductionOp> =
is_cuda_std_min_max_v<ReductionOp, __half2>
|| cub::detail::is_one_of_v<ReductionOp, SimdMin<__half>, SimdMax<__half>>;
# endif // _CCCL_HAS_NVFP16()
# if _CCCL_HAS_NVBF16()
template <typename ReductionOp>
inline constexpr bool enable_ternary_reduction_sm90_v<__nv_bfloat162, ReductionOp> =
is_cuda_std_min_max_v<ReductionOp, __nv_bfloat162>
|| cub::detail::is_one_of_v<ReductionOp, SimdMin<__nv_bfloat16>, SimdMax<__nv_bfloat16>>;
# endif // _CCCL_HAS_NVBF16()
// SM50+: ternary trees help for small-integer (<= 4 bytes) addition and bitwise ops
// (these map to 3-input hardware instructions on those architectures).
template <typename T, typename ReductionOp>
inline constexpr bool enable_ternary_reduction_sm50_v =
_CUDA_VSTD::is_integral_v<T> && sizeof(T) <= 4
&& (cub::detail::is_one_of_v<ReductionOp, _CUDA_VSTD::plus<>, _CUDA_VSTD::plus<T>>
|| is_cuda_std_bitwise_v<ReductionOp, T>);
/***********************************************************************************************************************
* Internal Reduction Algorithms: Sequential, Binary, Ternary
**********************************************************************************************************************/
// Plain left-to-right sequential reduction: fold every element into a single
// accumulator of type AccumT. The loop is fully unrolled at compile time.
template <typename AccumT, typename Input, typename ReductionOp>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceSequential(const Input& input, ReductionOp reduction_op)
{
  constexpr int count = cub::detail::static_size_v<Input>;
  auto result         = static_cast<AccumT>(input[0]);
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int pos = 1; pos < count; ++pos)
  {
    result = reduction_op(result, input[pos]);
  }
  return result;
}
// Pairwise (binary-tree) reduction: at each round, elements `stride` apart are
// combined in place, doubling the stride until everything folds into slot 0.
// Shortens the dependency chain compared to a sequential fold.
template <typename AccumT, typename Input, typename ReductionOp>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceBinaryTree(const Input& input, ReductionOp reduction_op)
{
  constexpr auto count = cub::detail::static_size_v<Input>;
  auto items           = cub::detail::to_array<AccumT>(input);
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int stride = 1; stride < count; stride *= 2)
  {
    _CCCL_PRAGMA_UNROLL_FULL()
    for (int pos = 0; pos + stride < count; pos += stride * 2)
    {
      items[pos] = reduction_op(items[pos], items[pos + stride]);
    }
  }
  return items[0];
}
// Three-way tree reduction: each round combines up to three elements `stride`
// apart into slot `pos`, tripling the stride per round. Useful on architectures
// with 3-input arithmetic/logic instructions.
template <typename AccumT, typename Input, typename ReductionOp>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op)
{
  constexpr auto count = cub::detail::static_size_v<Input>;
  auto items           = cub::detail::to_array<AccumT>(input);
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int stride = 1; stride < count; stride *= 3)
  {
    _CCCL_PRAGMA_UNROLL_FULL()
    for (int pos = 0; pos + stride < count; pos += stride * 3)
    {
      // Combine the first two lanes; fold in the third only if it exists.
      auto partial = reduction_op(items[pos], items[pos + stride]);
      items[pos]   = (pos + stride * 2 < count) ? reduction_op(partial, items[pos + stride * 2]) : partial;
    }
  }
  return items[0];
}
/***********************************************************************************************************************
* SIMD Reduction
**********************************************************************************************************************/
// Reinterpret the object representation of `input` as an `Output` of equal size.
// NOTE: bit_cast cannot always be used because __half, __nv_bfloat16, etc. are not trivially copyable
template <typename Output, typename Input>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE Output unsafe_bitcast(const Input& input)
{
  static_assert(sizeof(Input) == sizeof(Output), "wrong size");
  Output output;
  ::memcpy(&output, &input, sizeof(input));
  return output;
}
// SIMD reduction: pack pairs of T into one SimdType lane-pair, reduce the packed
// array with cub::ThreadReduce, then combine the two lanes (and a possible odd tail).
// The ReductionOp parameter is unnamed on purpose: only its *type* is used, to pick
// the matching SIMD functor via cub_operator_to_simd_operator_t.
template <typename Input, typename ReductionOp>
_CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp)
{
using cub::detail::unsafe_bitcast;
using T = _CUDA_VSTD::iter_value_t<Input>;
using SimdReduceOp = cub_operator_to_simd_operator_t<ReductionOp, T>;
using SimdType = simd_type_t<T>;
constexpr auto length = cub::detail::static_size_v<Input>;
// number of T elements per SIMD word (asserted to be exactly 2 below)
constexpr auto simd_ratio = sizeof(SimdType) / sizeof(T);
// largest multiple of simd_ratio that fits in length; any leftover element is handled as the tail
constexpr auto length_rounded = ::cuda::round_down(length, simd_ratio);
using UnpackedType = _CUDA_VSTD::array<T, simd_ratio>;
using SimdArray = _CUDA_VSTD::array<SimdType, length / simd_ratio>;
static_assert(simd_ratio == 2, "Only SIMD size == 2 is supported");
// copy the evenly-divisible prefix into a local array so it can be reinterpreted as SIMD words
T local_array[length_rounded];
UnrolledCopy<length_rounded>(input, local_array);
auto simd_input = unsafe_bitcast<SimdArray>(local_array);
// reduce the packed array; the result still holds one partial per lane
auto simd_reduction = cub::ThreadReduce(simd_input, SimdReduceOp{});
auto unpacked_values = unsafe_bitcast<UnpackedType>(simd_reduction);
// Create a reversed copy of the SIMD reduction result and apply the SIMD operator.
// This avoids redundant instructions for converting to and from 32-bit registers
T unpacked_values_rev[] = {unpacked_values[1], unpacked_values[0]};
auto simd_reduction_rev = unsafe_bitcast<SimdType>(unpacked_values_rev);
// after this, lane 0 holds the reduction of both lanes (lane 1 is a don't-care)
SimdType result = SimdReduceOp{}(simd_reduction, simd_reduction_rev);
// repeat the same optimization for the last element
if constexpr (length % simd_ratio == 1)
{
// lane 1 is padded with T{}; it never reaches the returned lane 0, so its value is irrelevant
T tail[] = {input[length - 1], T{}};
auto tail_simd = unsafe_bitcast<SimdType>(tail);
result = SimdReduceOp{}(result, tail_simd);
}
// only lane 0 carries the final answer
return unsafe_bitcast<UnpackedType>(result)[0];
}
// True when a min/max reduction over a small integer type (<= 16 bits) should be
// carried out in `int` instead: ThreadReduce below selects `int` as the intermediate
// accumulation type (PromT) when this holds, avoiding sub-word arithmetic.
template <typename ReductionOp, typename T>
inline constexpr bool enable_min_max_promotion_v =
is_cuda_std_min_max_v<ReductionOp, T> && _CUDA_VSTD::is_integral_v<T> && sizeof(T) <= 2;
} // namespace detail
/***********************************************************************************************************************
* Reduction Interface/Dispatch (public)
**********************************************************************************************************************/
//! @brief Per-thread reduction of a fixed-size random-access range, dispatching to the
//!        best internal algorithm (sequential, SIMD, ternary tree, binary tree) based on
//!        the element type, the operator, and the target architecture.
//! @param input        fixed-size range supporting operator[] with a compile-time size
//! @param reduction_op binary functor combining two values
//! @return             the reduction of all elements, as AccumT
template <typename Input, typename ReductionOp, typename ValueT, typename AccumT>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
{
  static_assert(detail::is_fixed_size_random_access_range_v<Input>,
                "Input must support the subscript operator[] and have a compile-time size");
  static_assert(cub::detail::has_binary_call_operator<ReductionOp, ValueT>::value,
                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
  // trivial case: a single element needs no reduction
  if constexpr (cub::detail::static_size_v<Input> == 1)
  {
    return static_cast<AccumT>(input[0]);
  }
  using cub::detail::is_one_of_v;
  using namespace cub::detail;
  // promote 8/16-bit integer min/max to `int` accumulation (see enable_min_max_promotion_v)
  using PromT = _CUDA_VSTD::_If<enable_min_max_promotion_v<ReductionOp, ValueT>, int, AccumT>;
  // TODO: should be part of the tuning policy
  if constexpr ((!is_cuda_std_operator_v<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>)
                || sizeof(ValueT) >= 8)
  {
    return ThreadReduceSequential<AccumT>(input, reduction_op);
  }
  constexpr auto length = cub::detail::static_size_v<Input>;
  // SIMD paths require that accumulation does not widen the element type (ValueT == AccumT).
  // BUGFIX: the traits must be queried with the element type ValueT, not the whole range
  // type Input — with Input these conditions were always false and the SIMD paths were dead.
  if constexpr (_CUDA_VSTD::is_same_v<ValueT, AccumT> && enable_sm90_simd_reduction_v<ValueT, ReductionOp, length>)
  {
    NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSimd(input, reduction_op);))
  }
  if constexpr (_CUDA_VSTD::is_same_v<ValueT, AccumT> && enable_sm80_simd_reduction_v<ValueT, ReductionOp, length>)
  {
    NV_IF_TARGET(NV_PROVIDES_SM_80, (return ThreadReduceSimd(input, reduction_op);))
  }
  if constexpr (_CUDA_VSTD::is_same_v<ValueT, AccumT> && enable_sm70_simd_reduction_v<ValueT, ReductionOp, length>)
  {
    NV_IF_TARGET(NV_PROVIDES_SM_70, (return ThreadReduceSimd(input, reduction_op);))
  }
  // ternary-tree reductions operate on the (possibly promoted) accumulation type PromT
  if constexpr (enable_ternary_reduction_sm90_v<PromT, ReductionOp>)
  {
    // with the current tuning policies, SM90/int32/+ uses too many registers (TODO: fix tuning policy)
    if constexpr ((is_one_of_v<ReductionOp, _CUDA_VSTD::plus<>, _CUDA_VSTD::plus<PromT>>
                   && is_one_of_v<PromT, int32_t, uint32_t>)
                  // the compiler generates bad code for int8/uint8 and min/max for SM90
                  || (is_cuda_std_min_max_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
    {
      NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
    }
    NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
  }
  if constexpr (enable_ternary_reduction_sm50_v<PromT, ReductionOp>)
  {
    // BUGFIX: this branch exists to use the ternary tree on SM50+ (3-input integer
    // instructions); it previously called ThreadReduceSequential, defeating the trait.
    NV_IF_TARGET(NV_PROVIDES_SM_50, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
  }
  // fallback for everything else
  return ThreadReduceBinaryTree<PromT>(input, reduction_op);
}
//! @brief Per-thread reduction with an initial prefix value: equivalent to reducing
//!        the sequence {prefix, input[0], ..., input[N-1]}.
//! @param input        fixed-size range supporting operator[] with a compile-time size
//! @param reduction_op binary functor combining two values
//! @param prefix       value folded in ahead of the first element
//! @return             the combined reduction, as AccumT
template <typename Input,
          typename ReductionOp,
          typename PrefixT,
          typename ValueT = _CUDA_VSTD::iter_value_t<Input>,
          typename AccumT = _CUDA_VSTD::__accumulator_t<ReductionOp, ValueT, PrefixT>>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
ThreadReduce(const Input& input, ReductionOp reduction_op, PrefixT prefix)
{
  static_assert(detail::is_fixed_size_random_access_range_v<Input>,
                "Input must support the subscript operator[] and have a compile-time size");
  static_assert(detail::has_binary_call_operator<ReductionOp, ValueT>::value,
                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
  constexpr int length = cub::detail::static_size_v<Input>;
  // stage prefix followed by all inputs into one temporary array of the accumulator type
  AccumT staged[length + 1];
  staged[0] = prefix;
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int pos = 1; pos <= length; ++pos)
  {
    staged[pos] = input[pos - 1];
  }
  // reduce the staged array with the prefix-free overload
  return cub::ThreadReduce<decltype(staged), ReductionOp, AccumT, AccumT>(staged, reduction_op);
}
#endif // !_CCCL_DOXYGEN_INVOKED
CUB_NAMESPACE_END