cub/device/device_scan.cuh

File members: cub/device/device_scan.cuh

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/detail/choose_offset.cuh>
#include <cub/detail/nvtx.cuh>
#include <cub/device/dispatch/dispatch_scan.cuh>
#include <cub/device/dispatch/dispatch_scan_by_key.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/util_deprecated.cuh>

#include <cuda/std/__functional/invoke.h>

CUB_NAMESPACE_BEGIN

struct DeviceScan
{

  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
  CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    NumItemsT num_items,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;
    using InitT   = cub::detail::value_t<InputIteratorT>;

    // Initial value
    InitT init_value{};

    return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>::
      Dispatch(d_temp_storage,
               temp_storage_bytes,
               d_in,
               d_out,
               ::cuda::std::plus<>{},
               detail::InputValue<InitT>(init_value),
               num_items,
               stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return ExclusiveSum<InputIteratorT, OutputIteratorT>(
      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename IteratorT, typename NumItemsT>
  CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
  {
    return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename IteratorT, typename NumItemsT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    IteratorT d_data,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return ExclusiveSum<IteratorT>(d_temp_storage, temp_storage_bytes, d_data, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
  CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    ScanOpT scan_op,
    InitValueT init_value,
    NumItemsT num_items,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;

    return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
      d_temp_storage,
      temp_storage_bytes,
      d_in,
      d_out,
      scan_op,
      detail::InputValue<InitValueT>(init_value),
      num_items,
      stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    ScanOpT scan_op,
    InitValueT init_value,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return ExclusiveScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT>(
      d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
  CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    IteratorT d_data,
    ScanOpT scan_op,
    InitValueT init_value,
    NumItemsT num_items,
    cudaStream_t stream = 0)
  {
    return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    IteratorT d_data,
    ScanOpT scan_op,
    InitValueT init_value,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return ExclusiveScan<IteratorT, ScanOpT, InitValueT>(
      d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename InputIteratorT,
            typename OutputIteratorT,
            typename ScanOpT,
            typename InitValueT,
            typename InitValueIterT = InitValueT*,
            typename NumItemsT      = int>
  CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    ScanOpT scan_op,
    FutureValue<InitValueT, InitValueIterT> init_value,
    NumItemsT num_items,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;

    return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
      d_temp_storage,
      temp_storage_bytes,
      d_in,
      d_out,
      scan_op,
      detail::InputValue<InitValueT>(init_value),
      num_items,
      stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename InputIteratorT,
            typename OutputIteratorT,
            typename ScanOpT,
            typename InitValueT,
            typename InitValueIterT = InitValueT*,
            typename NumItemsT      = int>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    ScanOpT scan_op,
    FutureValue<InitValueT, InitValueIterT> init_value,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return ExclusiveScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, InitValueIterT>(
      d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename IteratorT,
            typename ScanOpT,
            typename InitValueT,
            typename InitValueIterT = InitValueT*,
            typename NumItemsT      = int>
  CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    IteratorT d_data,
    ScanOpT scan_op,
    FutureValue<InitValueT, InitValueIterT> init_value,
    NumItemsT num_items,
    cudaStream_t stream = 0)
  {
    return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename IteratorT,
            typename ScanOpT,
            typename InitValueT,
            typename InitValueIterT = InitValueT*,
            typename NumItemsT      = int>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    IteratorT d_data,
    ScanOpT scan_op,
    FutureValue<InitValueT, InitValueIterT> init_value,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return ExclusiveScan<IteratorT, ScanOpT, InitValueT, InitValueIterT>(
      d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
  CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    NumItemsT num_items,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;

    return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch(
      d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return InclusiveSum<InputIteratorT, OutputIteratorT>(
      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename IteratorT, typename NumItemsT>
  CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
  {
    return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename IteratorT, typename NumItemsT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    IteratorT d_data,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return InclusiveSum<IteratorT>(d_temp_storage, temp_storage_bytes, d_data, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
  CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    ScanOpT scan_op,
    NumItemsT num_items,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;

    return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
      d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream);
  }

  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
  CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    ScanOpT scan_op,
    InitValueT init_value,
    NumItemsT num_items,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;
    using AccumT  = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::value_t<InputIteratorT>, InitValueT>;
    constexpr bool ForceInclusive = true;

    return DispatchScan<
      InputIteratorT,
      OutputIteratorT,
      ScanOpT,
      detail::InputValue<InitValueT>,
      OffsetT,
      AccumT,
      detail::scan::policy_hub<AccumT, ScanOpT>,
      ForceInclusive>::Dispatch(d_temp_storage,
                                temp_storage_bytes,
                                d_in,
                                d_out,
                                scan_op,
                                detail::InputValue<InitValueT>(init_value),
                                num_items,
                                stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    ScanOpT scan_op,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return InclusiveScan<InputIteratorT, OutputIteratorT, ScanOpT>(
      d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename IteratorT, typename ScanOpT, typename NumItemsT>
  CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    IteratorT d_data,
    ScanOpT scan_op,
    NumItemsT num_items,
    cudaStream_t stream = 0)
  {
    return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename IteratorT, typename ScanOpT, typename NumItemsT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    IteratorT d_data,
    ScanOpT scan_op,
    NumItemsT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return InclusiveScan<IteratorT, ScanOpT>(d_temp_storage, temp_storage_bytes, d_data, scan_op, num_items, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename KeysInputIteratorT,
            typename ValuesInputIteratorT,
            typename ValuesOutputIteratorT,
            typename EqualityOpT = ::cuda::std::equal_to<>,
            typename NumItemsT   = std::uint32_t>
  CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    KeysInputIteratorT d_keys_in,
    ValuesInputIteratorT d_values_in,
    ValuesOutputIteratorT d_values_out,
    NumItemsT num_items,
    EqualityOpT equality_op = EqualityOpT(),
    cudaStream_t stream     = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;
    using InitT   = cub::detail::value_t<ValuesInputIteratorT>;

    // Initial value
    InitT init_value{};

    return DispatchScanByKey<
      KeysInputIteratorT,
      ValuesInputIteratorT,
      ValuesOutputIteratorT,
      EqualityOpT,
      ::cuda::std::plus<>,
      InitT,
      OffsetT>::Dispatch(d_temp_storage,
                         temp_storage_bytes,
                         d_keys_in,
                         d_values_in,
                         d_values_out,
                         equality_op,
                         ::cuda::std::plus<>{},
                         init_value,
                         num_items,
                         stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename KeysInputIteratorT,
            typename ValuesInputIteratorT,
            typename ValuesOutputIteratorT,
            typename EqualityOpT = ::cuda::std::equal_to<>,
            typename NumItemsT   = std::uint32_t>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    KeysInputIteratorT d_keys_in,
    ValuesInputIteratorT d_values_in,
    ValuesOutputIteratorT d_values_out,
    NumItemsT num_items,
    EqualityOpT equality_op,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return ExclusiveSumByKey<KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT>(
      d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename KeysInputIteratorT,
            typename ValuesInputIteratorT,
            typename ValuesOutputIteratorT,
            typename ScanOpT,
            typename InitValueT,
            typename EqualityOpT = ::cuda::std::equal_to<>,
            typename NumItemsT   = std::uint32_t>
  CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    KeysInputIteratorT d_keys_in,
    ValuesInputIteratorT d_values_in,
    ValuesOutputIteratorT d_values_out,
    ScanOpT scan_op,
    InitValueT init_value,
    NumItemsT num_items,
    EqualityOpT equality_op = EqualityOpT(),
    cudaStream_t stream     = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;

    return DispatchScanByKey<
      KeysInputIteratorT,
      ValuesInputIteratorT,
      ValuesOutputIteratorT,
      EqualityOpT,
      ScanOpT,
      InitValueT,
      OffsetT>::Dispatch(d_temp_storage,
                         temp_storage_bytes,
                         d_keys_in,
                         d_values_in,
                         d_values_out,
                         equality_op,
                         scan_op,
                         init_value,
                         num_items,
                         stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename KeysInputIteratorT,
            typename ValuesInputIteratorT,
            typename ValuesOutputIteratorT,
            typename ScanOpT,
            typename InitValueT,
            typename EqualityOpT = ::cuda::std::equal_to<>,
            typename NumItemsT   = std::uint32_t>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    KeysInputIteratorT d_keys_in,
    ValuesInputIteratorT d_values_in,
    ValuesOutputIteratorT d_values_out,
    ScanOpT scan_op,
    InitValueT init_value,
    NumItemsT num_items,
    EqualityOpT equality_op,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return ExclusiveScanByKey<KeysInputIteratorT,
                              ValuesInputIteratorT,
                              ValuesOutputIteratorT,
                              ScanOpT,
                              InitValueT,
                              EqualityOpT>(
      d_temp_storage,
      temp_storage_bytes,
      d_keys_in,
      d_values_in,
      d_values_out,
      scan_op,
      init_value,
      num_items,
      equality_op,
      stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename KeysInputIteratorT,
            typename ValuesInputIteratorT,
            typename ValuesOutputIteratorT,
            typename EqualityOpT = ::cuda::std::equal_to<>,
            typename NumItemsT   = std::uint32_t>
  CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    KeysInputIteratorT d_keys_in,
    ValuesInputIteratorT d_values_in,
    ValuesOutputIteratorT d_values_out,
    NumItemsT num_items,
    EqualityOpT equality_op = EqualityOpT(),
    cudaStream_t stream     = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;

    return DispatchScanByKey<
      KeysInputIteratorT,
      ValuesInputIteratorT,
      ValuesOutputIteratorT,
      EqualityOpT,
      ::cuda::std::plus<>,
      NullType,
      OffsetT>::Dispatch(d_temp_storage,
                         temp_storage_bytes,
                         d_keys_in,
                         d_values_in,
                         d_values_out,
                         equality_op,
                         ::cuda::std::plus<>{},
                         NullType{},
                         num_items,
                         stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename KeysInputIteratorT,
            typename ValuesInputIteratorT,
            typename ValuesOutputIteratorT,
            typename EqualityOpT = ::cuda::std::equal_to<>,
            typename NumItemsT   = std::uint32_t>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    KeysInputIteratorT d_keys_in,
    ValuesInputIteratorT d_values_in,
    ValuesOutputIteratorT d_values_out,
    NumItemsT num_items,
    EqualityOpT equality_op,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return InclusiveSumByKey<KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT>(
      d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  template <typename KeysInputIteratorT,
            typename ValuesInputIteratorT,
            typename ValuesOutputIteratorT,
            typename ScanOpT,
            typename EqualityOpT = ::cuda::std::equal_to<>,
            typename NumItemsT   = std::uint32_t>
  CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    KeysInputIteratorT d_keys_in,
    ValuesInputIteratorT d_values_in,
    ValuesOutputIteratorT d_values_out,
    ScanOpT scan_op,
    NumItemsT num_items,
    EqualityOpT equality_op = EqualityOpT(),
    cudaStream_t stream     = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey");

    // Unsigned integer type for global offsets
    using OffsetT = detail::choose_offset_t<NumItemsT>;

    return DispatchScanByKey<
      KeysInputIteratorT,
      ValuesInputIteratorT,
      ValuesOutputIteratorT,
      EqualityOpT,
      ScanOpT,
      NullType,
      OffsetT>::Dispatch(d_temp_storage,
                         temp_storage_bytes,
                         d_keys_in,
                         d_values_in,
                         d_values_out,
                         equality_op,
                         scan_op,
                         NullType(),
                         num_items,
                         stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  template <typename KeysInputIteratorT,
            typename ValuesInputIteratorT,
            typename ValuesOutputIteratorT,
            typename ScanOpT,
            typename EqualityOpT = ::cuda::std::equal_to<>,
            typename NumItemsT   = std::uint32_t>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    KeysInputIteratorT d_keys_in,
    ValuesInputIteratorT d_values_in,
    ValuesOutputIteratorT d_values_out,
    ScanOpT scan_op,
    NumItemsT num_items,
    EqualityOpT equality_op,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return InclusiveScanByKey<KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, ScanOpT, EqualityOpT>(
      d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

};

CUB_NAMESPACE_END