cub/thread/thread_reduce.cuh

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2025, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/detail/array_utils.cuh> // to_array()
#include <cub/detail/type_traits.cuh> // are_same()
#include <cub/thread/thread_load.cuh> // UnrolledCopy
#include <cub/thread/thread_operators.cuh>
#include <cub/thread/thread_simd.cuh>
#include <cub/util_namespace.cuh>

#include <cuda/cmath> // cuda::round_down
#include <cuda/functional> // cuda::maximum
#include <cuda/std/array> // array
#include <cuda/std/cassert> // assert
#include <cuda/std/cstdint> // uint16_t
#include <cuda/std/functional> // cuda::std::plus
#include <cuda/std/iterator> // cuda::std::iter_value_t

#if _CCCL_HAS_NVFP16()
#  include <cuda_fp16.h>
#endif // _CCCL_HAS_NVFP16()

#if _CCCL_HAS_NVBF16()
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
#  include <cuda_bf16.h>
_CCCL_DIAG_POP
#endif // _CCCL_HAS_NVBF16()

CUB_NAMESPACE_BEGIN

// forward declaration
template <typename Input,
          typename ReductionOp,
          typename ValueT = _CUDA_VSTD::iter_value_t<Input>,
          typename AccumT = _CUDA_VSTD::__accumulator_t<ReductionOp, ValueT>>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op);
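
// Usage sketch (illustrative, device code):
//
//   int data[4] = {1, 2, 3, 4};
//   int sum     = cub::ThreadReduce(data, _CUDA_VSTD::plus<>{}); // sum = 10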

/***********************************************************************************************************************
 * Internal Reduction Implementations
 **********************************************************************************************************************/
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document

namespace detail
{

/***********************************************************************************************************************
 * Enable SIMD/Tree reduction heuristics (Trait)
 **********************************************************************************************************************/

// TODO: add Blackwell support

//----------------------------------------------------------------------------------------------------------------------
// SM90 SIMD

template <typename T, typename ReductionOp, int Length>
inline constexpr bool enable_sm90_simd_reduction_v =
  cub::detail::is_one_of_v<T, int16_t, uint16_t> && is_cuda_std_min_max_v<ReductionOp, T> && Length >= 10;

//----------------------------------------------------------------------------------------------------------------------
// SM80 SIMD

template <typename T, typename ReductionOp, int Length>
inline constexpr bool enable_sm80_simd_reduction_v = false;

#  if _CCCL_HAS_NVFP16()

template <typename ReductionOp, int Length>
inline constexpr bool enable_sm80_simd_reduction_v<__half, ReductionOp, Length> =
  (is_cuda_std_min_max_v<ReductionOp, __half> || is_cuda_std_plus_mul_v<ReductionOp, __half>) && Length >= 4;

#  endif // _CCCL_HAS_NVFP16()

#  if _CCCL_HAS_NVBF16()

template <typename ReductionOp, int Length>
inline constexpr bool enable_sm80_simd_reduction_v<__nv_bfloat16, ReductionOp, Length> =
  (is_cuda_std_min_max_v<ReductionOp, __nv_bfloat16> || is_cuda_std_plus_mul_v<ReductionOp, __nv_bfloat16>)
  && Length >= 4;

#  endif // _CCCL_HAS_NVBF16()

//----------------------------------------------------------------------------------------------------------------------
// SM70 SIMD

#  if _CCCL_HAS_NVFP16()

template <typename T, typename ReductionOp, int Length>
inline constexpr bool enable_sm70_simd_reduction_v =
  _CUDA_VSTD::is_same_v<T, __half> && is_cuda_std_plus_mul_v<ReductionOp, T> && Length >= 4;

#  else // _CCCL_HAS_NVFP16() ^^^^ / !_CCCL_HAS_NVFP16() vvvv

template <typename T, typename ReductionOp, int Length>
inline constexpr bool enable_sm70_simd_reduction_v = false;

#  endif // !_CCCL_HAS_NVFP16() ^^^^

/***********************************************************************************************************************
 * Enable Ternary Reduction (Trait)
 **********************************************************************************************************************/

template <typename T, typename ReductionOp>
inline constexpr bool enable_ternary_reduction_sm90_v =
  cub::detail::is_one_of_v<T, int32_t, uint32_t> && is_cuda_std_min_max_v<ReductionOp, T>;

#  if _CCCL_HAS_NVFP16()

template <typename ReductionOp>
inline constexpr bool enable_ternary_reduction_sm90_v<__half2, ReductionOp> =
  is_cuda_std_min_max_v<ReductionOp, __half2>
  || cub::detail::is_one_of_v<ReductionOp, SimdMin<__half>, SimdMax<__half>>;

#  endif // _CCCL_HAS_NVFP16()

#  if _CCCL_HAS_NVBF16()

template <typename ReductionOp>
inline constexpr bool enable_ternary_reduction_sm90_v<__nv_bfloat162, ReductionOp> =
  is_cuda_std_min_max_v<ReductionOp, __nv_bfloat162>
  || cub::detail::is_one_of_v<ReductionOp, SimdMin<__nv_bfloat16>, SimdMax<__nv_bfloat16>>;

#  endif // _CCCL_HAS_NVBF16()

template <typename T, typename ReductionOp>
inline constexpr bool enable_ternary_reduction_sm50_v =
  _CUDA_VSTD::is_integral_v<T> && sizeof(T) <= 4
  && (cub::detail::is_one_of_v<ReductionOp, _CUDA_VSTD::plus<>, _CUDA_VSTD::plus<T>>
      || is_cuda_std_bitwise_v<ReductionOp, T>);

/***********************************************************************************************************************
 * Internal Reduction Algorithms: Sequential, Binary, Ternary
 **********************************************************************************************************************/

template <typename AccumT, typename Input, typename ReductionOp>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceSequential(const Input& input, ReductionOp reduction_op)
{
  auto retval = static_cast<AccumT>(input[0]);
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 1; i < cub::detail::static_size_v<Input>; ++i)
  {
    retval = reduction_op(retval, input[i]);
  }
  return retval;
}
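
// Example (illustrative): for length == 4, ThreadReduceSequential computes
//   op(op(op(input[0], input[1]), input[2]), input[3])
// i.e. a single dependency chain of length - 1 operations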

template <typename AccumT, typename Input, typename ReductionOp>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceBinaryTree(const Input& input, ReductionOp reduction_op)
{
  constexpr auto length = cub::detail::static_size_v<Input>;
  auto array            = cub::detail::to_array<AccumT>(input);
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 1; i < length; i *= 2)
  {
    _CCCL_PRAGMA_UNROLL_FULL()
    for (int j = 0; j + i < length; j += i * 2)
    {
      array[j] = reduction_op(array[j], array[j + i]);
    }
  }
  return array[0];
}
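
// Example (illustrative): for length == 4, the binary tree above computes
//   pass i = 1: array[0] = op(array[0], array[1]); array[2] = op(array[2], array[3]);
//   pass i = 2: array[0] = op(array[0], array[2]);
// shortening the dependency chain from length - 1 to ceil(log2(length)) steps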

template <typename AccumT, typename Input, typename ReductionOp>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op)
{
  constexpr auto length = cub::detail::static_size_v<Input>;
  auto array            = cub::detail::to_array<AccumT>(input);
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 1; i < length; i *= 3)
  {
    _CCCL_PRAGMA_UNROLL_FULL()
    for (int j = 0; j + i < length; j += i * 3)
    {
      auto value = reduction_op(array[j], array[j + i]);
      array[j]   = (j + i * 2 < length) ? reduction_op(value, array[j + i * 2]) : value;
    }
  }
  return array[0];
}
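
// Example (illustrative): for length == 5, the ternary tree above computes
//   pass i = 1: array[0] = op(op(array[0], array[1]), array[2]); array[3] = op(array[3], array[4]);
//   pass i = 3: array[0] = op(array[0], array[3]);
// radix-3 combining maps well to hardware ternary instructions (e.g. IADD3/LOP3)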

/***********************************************************************************************************************
 * SIMD Reduction
 **********************************************************************************************************************/

// NOTE: bit_cast cannot always be used because __half, __nv_bfloat16, etc. are not trivially copyable
template <typename Output, typename Input>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE Output unsafe_bitcast(const Input& input)
{
  Output output;
  static_assert(sizeof(input) == sizeof(output), "wrong size");
  ::memcpy(&output, &input, sizeof(input));
  return output;
}
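
// Usage sketch (illustrative; x and y are placeholder __half values): pack two
// __half values into a __half2 SIMD register
//
//   __half pair[2] = {x, y};
//   auto packed    = unsafe_bitcast<__half2>(pair);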

template <typename Input, typename ReductionOp>
_CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp)
{
  using cub::detail::unsafe_bitcast;
  using T                       = _CUDA_VSTD::iter_value_t<Input>;
  using SimdReduceOp            = cub_operator_to_simd_operator_t<ReductionOp, T>;
  using SimdType                = simd_type_t<T>;
  constexpr auto length         = cub::detail::static_size_v<Input>;
  constexpr auto simd_ratio     = sizeof(SimdType) / sizeof(T);
  constexpr auto length_rounded = ::cuda::round_down(length, simd_ratio);
  using UnpackedType            = _CUDA_VSTD::array<T, simd_ratio>;
  using SimdArray               = _CUDA_VSTD::array<SimdType, length / simd_ratio>;
  static_assert(simd_ratio == 2, "Only SIMD size == 2 is supported");
  T local_array[length_rounded];
  UnrolledCopy<length_rounded>(input, local_array);
  auto simd_input      = unsafe_bitcast<SimdArray>(local_array);
  auto simd_reduction  = cub::ThreadReduce(simd_input, SimdReduceOp{});
  auto unpacked_values = unsafe_bitcast<UnpackedType>(simd_reduction);
  // Create a reversed copy of the SIMD reduction result and apply the SIMD operator.
  // This avoids redundant instructions for converting to and from 32-bit registers
  T unpacked_values_rev[] = {unpacked_values[1], unpacked_values[0]};
  auto simd_reduction_rev = unsafe_bitcast<SimdType>(unpacked_values_rev);
  SimdType result         = SimdReduceOp{}(simd_reduction, simd_reduction_rev);
  // repeat the same optimization for the last element
  if constexpr (length % simd_ratio == 1)
  {
    T tail[]       = {input[length - 1], T{}};
    auto tail_simd = unsafe_bitcast<SimdType>(tail);
    result         = SimdReduceOp{}(result, tail_simd);
  }
  return unsafe_bitcast<UnpackedType>(result)[0];
}
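
// Example (illustrative): for Input = __half[5] and a plus reduction, the steps above are
//   1. copy input[0..3] into local_array and bit-cast it to a __half2[2] array
//   2. recursively ThreadReduce the __half2 pairs, yielding {in0 + in2, in1 + in3}
//   3. add the swapped copy, so lane 0 holds in0 + in1 + in2 + in3
//   4. since length % 2 == 1, fold the tail element input[4] into lane 0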

// Promote 8/16-bit integer min/max to 32-bit int: the promotion is lossless for
// integral min/max and avoids sub-word arithmetic instructions
template <typename ReductionOp, typename T>
inline constexpr bool enable_min_max_promotion_v =
  is_cuda_std_min_max_v<ReductionOp, T> && _CUDA_VSTD::is_integral_v<T> && sizeof(T) <= 2;

} // namespace detail

/***********************************************************************************************************************
 * Reduction Interface/Dispatch (public)
 **********************************************************************************************************************/

// Dispatch order: single-element inputs return directly; operators unknown to the
// heuristics and value types of 8+ bytes use sequential reduction; otherwise SIMD
// (SM90/SM80/SM70), ternary-tree, and binary-tree reductions are tried in that order
template <typename Input, typename ReductionOp, typename ValueT, typename AccumT>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
{
  static_assert(detail::is_fixed_size_random_access_range_v<Input>,
                "Input must support the subscript operator[] and have a compile-time size");
  static_assert(cub::detail::has_binary_call_operator<ReductionOp, ValueT>::value,
                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
  if constexpr (cub::detail::static_size_v<Input> == 1)
  {
    return static_cast<AccumT>(input[0]);
  }
  using namespace cub::detail;
  using PromT = _CUDA_VSTD::_If<enable_min_max_promotion_v<ReductionOp, ValueT>, int, AccumT>;
  // TODO: should be part of the tuning policy
  if constexpr ((!is_cuda_std_operator_v<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>)
                || sizeof(ValueT) >= 8)
  {
    return ThreadReduceSequential<AccumT>(input, reduction_op);
  }

  constexpr auto length = cub::detail::static_size_v<Input>;
  if constexpr (_CUDA_VSTD::is_same_v<ValueT, AccumT> && enable_sm90_simd_reduction_v<ValueT, ReductionOp, length>)
  {
    NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSimd(input, reduction_op);))
  }

  if constexpr (_CUDA_VSTD::is_same_v<ValueT, AccumT> && enable_sm80_simd_reduction_v<ValueT, ReductionOp, length>)
  {
    NV_IF_TARGET(NV_PROVIDES_SM_80, (return ThreadReduceSimd(input, reduction_op);))
  }

  if constexpr (_CUDA_VSTD::is_same_v<ValueT, AccumT> && enable_sm70_simd_reduction_v<ValueT, ReductionOp, length>)
  {
    NV_IF_TARGET(NV_PROVIDES_SM_70, (return ThreadReduceSimd(input, reduction_op);))
  }

  if constexpr (enable_ternary_reduction_sm90_v<PromT, ReductionOp>)
  {
    // with the current tuning policies, SM90/int32/+ uses too many registers (TODO: fix tuning policy)
    if constexpr ((is_one_of_v<ReductionOp, _CUDA_VSTD::plus<>, _CUDA_VSTD::plus<PromT>>
                   && is_one_of_v<PromT, int32_t, uint32_t>)
                  // the compiler generates bad code for int8/uint8 and min/max for SM90
                  || (is_cuda_std_min_max_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
    {
      NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
    }
    NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
  }

  if constexpr (enable_ternary_reduction_sm50_v<PromT, ReductionOp>)
  {
    NV_IF_TARGET(NV_PROVIDES_SM_50, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
  }

  return ThreadReduceBinaryTree<PromT>(input, reduction_op);
}

template <typename Input,
          typename ReductionOp,
          typename PrefixT,
          typename ValueT = _CUDA_VSTD::iter_value_t<Input>,
          typename AccumT = _CUDA_VSTD::__accumulator_t<ReductionOp, ValueT, PrefixT>>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
ThreadReduce(const Input& input, ReductionOp reduction_op, PrefixT prefix)
{
  static_assert(detail::is_fixed_size_random_access_range_v<Input>,
                "Input must support the subscript operator[] and have a compile-time size");
  static_assert(detail::has_binary_call_operator<ReductionOp, ValueT>::value,
                "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
  constexpr int length = cub::detail::static_size_v<Input>;
  // copy to a temporary array of type AccumT
  AccumT array[length + 1];
  array[0] = prefix;

  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < length; ++i)
  {
    array[i + 1] = input[i];
  }
  return cub::ThreadReduce<decltype(array), ReductionOp, AccumT, AccumT>(array, reduction_op);
}
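
// Usage sketch (illustrative, device code):
//
//   int data[4] = {1, 2, 3, 4};
//   int sum     = cub::ThreadReduce(data, _CUDA_VSTD::plus<>{}, 10); // sum = 10 + 1 + 2 + 3 + 4 = 20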

#endif // !_CCCL_DOXYGEN_INVOKED

CUB_NAMESPACE_END