cub/device/device_scan.cuh
File members: cub/device/device_scan.cuh
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <cub/config.cuh>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include <cub/detail/choose_offset.cuh>
#include <cub/detail/nvtx.cuh>
#include <cub/device/dispatch/dispatch_scan.cuh>
#include <cub/device/dispatch/dispatch_scan_by_key.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cuda/std/__functional/invoke.h>
CUB_NAMESPACE_BEGIN
struct DeviceScan
{
template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
using InitT = cub::detail::value_t<InputIteratorT>;
// Initial value
InitT init_value{};
return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>::
Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
::cuda::std::plus<>{},
detail::InputValue<InitT>(init_value),
num_items,
stream);
}
template <typename IteratorT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
{
return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
}
template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
InitValueT init_value,
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
detail::InputValue<InitValueT>(init_value),
num_items,
stream);
}
template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
void* d_temp_storage,
size_t& temp_storage_bytes,
IteratorT d_data,
ScanOpT scan_op,
InitValueT init_value,
NumItemsT num_items,
cudaStream_t stream = 0)
{
return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
}
template <typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT,
typename InitValueT,
typename InitValueIterT = InitValueT*,
typename NumItemsT = int>
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
FutureValue<InitValueT, InitValueIterT> init_value,
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
detail::InputValue<InitValueT>(init_value),
num_items,
stream);
}
template <typename IteratorT,
typename ScanOpT,
typename InitValueT,
typename InitValueIterT = InitValueT*,
typename NumItemsT = int>
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
void* d_temp_storage,
size_t& temp_storage_bytes,
IteratorT d_data,
ScanOpT scan_op,
FutureValue<InitValueT, InitValueIterT> init_value,
NumItemsT num_items,
cudaStream_t stream = 0)
{
return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
}
template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream);
}
template <typename IteratorT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
{
return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
}
template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream);
}
template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
InitValueT init_value,
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::value_t<InputIteratorT>, InitValueT>;
constexpr bool ForceInclusive = true;
return DispatchScan<
InputIteratorT,
OutputIteratorT,
ScanOpT,
detail::InputValue<InitValueT>,
OffsetT,
AccumT,
detail::scan::policy_hub<AccumT, ScanOpT>,
ForceInclusive>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
detail::InputValue<InitValueT>(init_value),
num_items,
stream);
}
template <typename IteratorT, typename ScanOpT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
void* d_temp_storage,
size_t& temp_storage_bytes,
IteratorT d_data,
ScanOpT scan_op,
NumItemsT num_items,
cudaStream_t stream = 0)
{
return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream);
}
template <typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename EqualityOpT = ::cuda::std::equal_to<>,
typename NumItemsT = std::uint32_t>
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
NumItemsT num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
using InitT = cub::detail::value_t<ValuesInputIteratorT>;
// Initial value
InitT init_value{};
return DispatchScanByKey<
KeysInputIteratorT,
ValuesInputIteratorT,
ValuesOutputIteratorT,
EqualityOpT,
::cuda::std::plus<>,
InitT,
OffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_values_out,
equality_op,
::cuda::std::plus<>{},
init_value,
num_items,
stream);
}
template <typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename ScanOpT,
typename InitValueT,
typename EqualityOpT = ::cuda::std::equal_to<>,
typename NumItemsT = std::uint32_t>
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
ScanOpT scan_op,
InitValueT init_value,
NumItemsT num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
return DispatchScanByKey<
KeysInputIteratorT,
ValuesInputIteratorT,
ValuesOutputIteratorT,
EqualityOpT,
ScanOpT,
InitValueT,
OffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_values_out,
equality_op,
scan_op,
init_value,
num_items,
stream);
}
template <typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename EqualityOpT = ::cuda::std::equal_to<>,
typename NumItemsT = std::uint32_t>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
NumItemsT num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
return DispatchScanByKey<
KeysInputIteratorT,
ValuesInputIteratorT,
ValuesOutputIteratorT,
EqualityOpT,
::cuda::std::plus<>,
NullType,
OffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_values_out,
equality_op,
::cuda::std::plus<>{},
NullType{},
num_items,
stream);
}
template <typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename ScanOpT,
typename EqualityOpT = ::cuda::std::equal_to<>,
typename NumItemsT = std::uint32_t>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
ScanOpT scan_op,
NumItemsT num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0)
{
CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey");
// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
return DispatchScanByKey<
KeysInputIteratorT,
ValuesInputIteratorT,
ValuesOutputIteratorT,
EqualityOpT,
ScanOpT,
NullType,
OffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_values_out,
equality_op,
scan_op,
NullType(),
num_items,
stream);
}
};
CUB_NAMESPACE_END