cub/device/device_histogram.cuh
File members: cub/device/device_histogram.cuh
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <cub/config.cuh>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include <cub/detail/nvtx.cuh>
#include <cub/device/dispatch/dispatch_histogram.cuh>
#include <cub/util_deprecated.cuh>
#include <iterator>
#include <limits>
#include <stdio.h>
CUB_NAMESPACE_BEGIN
//! DeviceHistogram provides device-wide parallel operations for constructing
//! histogram(s) from sequences of data samples residing within
//! device-accessible memory.
//!
//! Usage pattern (shared by every member function):
//!   1. Call with `d_temp_storage == nullptr`: no work is performed and the
//!      required allocation size is returned in `temp_storage_bytes`.
//!   2. Allocate that much device-accessible temporary storage.
//!   3. Call again with the allocation to run the histogram.
//!
//! Two bin-boundary flavors are offered:
//!   - `*Even`:  `num_levels - 1` equal-width bins spanning
//!     [`lower_level`, `upper_level`).
//!   - `*Range`: `num_levels - 1` bins whose boundaries are given explicitly
//!     in a device-accessible `d_levels` array.
//! The `Multi*` variants histogram up to `NUM_ACTIVE_CHANNELS` of the
//! `NUM_CHANNELS` interleaved channels of "pixel" data (e.g. RGB from RGBA).
struct DeviceHistogram
{
  //! Computes an intensity histogram from a sequence of data samples using
  //! equal-width bins.
  //!
  //! - The number of histogram bins is `num_levels - 1`.
  //! - All bins comprise the same width of sample values:
  //!   `(upper_level - lower_level) / (num_levels - 1)`.
  //!
  //! @param d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //! @param temp_storage_bytes
  //!   Reference to the size in bytes of the `d_temp_storage` allocation
  //! @param d_samples
  //!   The pointer to the input sequence of data samples
  //! @param d_histogram
  //!   Pointer to the histogram counter output array of length
  //!   `num_levels - 1`
  //! @param num_levels
  //!   Number of boundaries (levels) delineating the histogram bins
  //! @param lower_level
  //!   Lower sample value bound (inclusive) of the lowest histogram bin
  //! @param upper_level
  //!   Upper sample value bound (exclusive) of the highest histogram bin
  //! @param num_samples
  //!   The number of input samples (i.e., the length of `d_samples`)
  //! @param stream
  //!   CUDA stream to launch kernels within. Default is stream-0.
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_samples,
    cudaStream_t stream = 0)
  {
    using SampleT = cub::detail::value_t<SampleIteratorT>;

    // A single-channel histogram is expressed as a one-channel, single-row
    // multi-channel histogram whose row stride spans the whole sequence.
    return MultiHistogramEven<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      &d_histogram,
      &num_levels,
      &lower_level,
      &upper_level,
      num_samples,
      static_cast<OffsetT>(1),
      sizeof(SampleT) * num_samples,
      stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  //! Deprecated overload retained for source compatibility. The
  //! `debug_synchronous` argument is ignored (runtime debug synchronization
  //! is no longer supported); the call forwards to the stream-only overload.
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_samples,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return HistogramEven(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_samples,
      stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  //! Computes an intensity histogram using equal-width bins from a 2D
  //! region-of-interest: `num_rows` rows of `num_row_samples` samples each,
  //! where consecutive rows are `row_stride_bytes` bytes apart.
  //!
  //! @param d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //! @param temp_storage_bytes
  //!   Reference to the size in bytes of the `d_temp_storage` allocation
  //! @param d_samples
  //!   The pointer to the input sequence of data samples
  //! @param d_histogram
  //!   Pointer to the histogram counter output array of length
  //!   `num_levels - 1`
  //! @param num_levels
  //!   Number of boundaries (levels) delineating the histogram bins
  //! @param lower_level
  //!   Lower sample value bound (inclusive) of the lowest histogram bin
  //! @param upper_level
  //!   Upper sample value bound (exclusive) of the highest histogram bin
  //! @param num_row_samples
  //!   The number of data samples per row in the region of interest
  //! @param num_rows
  //!   The number of rows in the region of interest
  //! @param row_stride_bytes
  //!   The number of bytes between starts of consecutive rows in the region
  //!   of interest
  //! @param stream
  //!   CUDA stream to launch kernels within. Default is stream-0.
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    return MultiHistogramEven<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      &d_histogram,
      &num_levels,
      &lower_level,
      &upper_level,
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  //! Deprecated overload retained for source compatibility. The
  //! `debug_synchronous` argument is ignored; the call forwards to the
  //! stream-only overload.
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return HistogramEven(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  //! Computes per-channel intensity histograms from a sequence of
  //! multi-channel "pixel" data samples using equal-width bins.
  //!
  //! - The input is a sequence of pixel structures, where each pixel
  //!   comprises a record of `NUM_CHANNELS` consecutive data samples
  //!   (e.g., `NUM_CHANNELS == 4` for RGBA samples).
  //! - Of the `NUM_CHANNELS` specified, the function will only compute
  //!   histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., only RGB
  //!   histograms from RGBA pixel samples).
  //! - The number of histogram bins for channel `i` is `num_levels[i] - 1`.
  //!
  //! @param d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //! @param temp_storage_bytes
  //!   Reference to the size in bytes of the `d_temp_storage` allocation
  //! @param d_samples
  //!   The pointer to the multi-channel input sequence of data samples. The
  //!   samples from different channels are assumed to be interleaved (e.g.,
  //!   an array of 32-bit pixels where each pixel consists of four 8-bit
  //!   samples).
  //! @param d_histogram
  //!   Array of `NUM_ACTIVE_CHANNELS` pointers; `d_histogram[i]` is the
  //!   counter output array for channel `i`, of length `num_levels[i] - 1`.
  //! @param num_levels
  //!   Number of bin boundaries per active channel
  //! @param lower_level
  //!   Per-channel lower sample value bound (inclusive) of the lowest bin
  //! @param upper_level
  //!   Per-channel upper sample value bound (exclusive) of the highest bin
  //! @param num_pixels
  //!   The number of multi-channel pixels (i.e., the length of `d_samples`
  //!   divided by `NUM_CHANNELS`)
  //! @param stream
  //!   CUDA stream to launch kernels within. Default is stream-0.
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream = 0)
  {
    using SampleT = cub::detail::value_t<SampleIteratorT>;

    // Express as a single-row region of interest whose row stride covers the
    // full interleaved pixel sequence.
    return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_pixels,
      static_cast<OffsetT>(1),
      sizeof(SampleT) * NUM_CHANNELS * num_pixels,
      stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  //! Deprecated overload retained for source compatibility. The
  //! `debug_synchronous` argument is ignored; the call forwards to the
  //! stream-only overload.
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // NOTE: the template arguments must be spelled out explicitly here:
    // NUM_CHANNELS / NUM_ACTIVE_CHANNELS appear only as array bounds in the
    // parameter list (a non-deduced context after array-to-pointer decay),
    // so template argument deduction cannot recover them from the call.
    return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_pixels,
      stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  //! Computes per-channel intensity histograms using equal-width bins from a
  //! 2D region-of-interest of multi-channel "pixel" data samples: `num_rows`
  //! rows of `num_row_pixels` pixels each, where consecutive rows are
  //! `row_stride_bytes` bytes apart.
  //!
  //! @param d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //! @param temp_storage_bytes
  //!   Reference to the size in bytes of the `d_temp_storage` allocation
  //! @param d_samples
  //!   The pointer to the multi-channel input sequence of data samples
  //!   (interleaved channels)
  //! @param d_histogram
  //!   Array of `NUM_ACTIVE_CHANNELS` pointers; `d_histogram[i]` is the
  //!   counter output array for channel `i`, of length `num_levels[i] - 1`.
  //! @param num_levels
  //!   Number of bin boundaries per active channel
  //! @param lower_level
  //!   Per-channel lower sample value bound (inclusive) of the lowest bin
  //! @param upper_level
  //!   Per-channel upper sample value bound (exclusive) of the highest bin
  //! @param num_row_pixels
  //!   The number of multi-channel pixels per row in the region of interest
  //! @param num_rows
  //!   The number of rows in the region of interest
  //! @param row_stride_bytes
  //!   The number of bytes between starts of consecutive rows in the region
  //!   of interest
  //! @param stream
  //!   CUDA stream to launch kernels within. Default is stream-0.
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramEven");
    using SampleT = cub::detail::value_t<SampleIteratorT>;

    // Tag type telling the dispatch layer whether samples are 1-byte values.
    Int2Type<sizeof(SampleT) == 1> is_byte_sample;

    _CCCL_IF_CONSTEXPR (sizeof(OffsetT) > sizeof(int))
    {
      if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
      {
        // Down-convert OffsetT data type: the whole input extent fits in a
        // signed 32-bit integer, so dispatch with `int` offsets.
        return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::
          DispatchEven(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram,
            num_levels,
            lower_level,
            upper_level,
            (int) num_row_pixels,
            (int) num_rows,
            (int) (row_stride_bytes / sizeof(SampleT)),
            stream,
            is_byte_sample);
      }
    }

    // Note: the dispatch layer takes the row stride in *samples*, not bytes.
    return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::
      DispatchEven(
        d_temp_storage,
        temp_storage_bytes,
        d_samples,
        d_histogram,
        num_levels,
        lower_level,
        upper_level,
        num_row_pixels,
        num_rows,
        (OffsetT) (row_stride_bytes / sizeof(SampleT)),
        stream,
        is_byte_sample);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  //! Deprecated overload retained for source compatibility. The
  //! `debug_synchronous` argument is ignored; the call forwards to the
  //! stream-only overload.
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return MultiHistogramEven(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_row_pixels,
      num_rows,
      row_stride_bytes,
      stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  //! Computes an intensity histogram from a sequence of data samples using
  //! the specified bin boundary levels.
  //!
  //! - The number of histogram bins is `num_levels - 1`.
  //! - The value range for bin `i` is `[d_levels[i], d_levels[i + 1])`.
  //!
  //! @param d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //! @param temp_storage_bytes
  //!   Reference to the size in bytes of the `d_temp_storage` allocation
  //! @param d_samples
  //!   The pointer to the input sequence of data samples
  //! @param d_histogram
  //!   Pointer to the histogram counter output array of length
  //!   `num_levels - 1`
  //! @param num_levels
  //!   Number of boundaries (levels) delineating the histogram bins
  //! @param d_levels
  //!   The pointer to the array of boundaries (levels), one more than the
  //!   number of bins. Bin ranges are defined by consecutive boundary pairs.
  //! @param num_samples
  //!   The number of input samples (i.e., the length of `d_samples`)
  //! @param stream
  //!   CUDA stream to launch kernels within. Default is stream-0.
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_samples,
    cudaStream_t stream = 0)
  {
    using SampleT = cub::detail::value_t<SampleIteratorT>;

    // A single-channel histogram is expressed as a one-channel, single-row
    // multi-channel histogram whose row stride spans the whole sequence.
    return MultiHistogramRange<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      &d_histogram,
      &num_levels,
      &d_levels,
      num_samples,
      (OffsetT) 1,
      (size_t) (sizeof(SampleT) * num_samples),
      stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  //! Deprecated overload retained for source compatibility. The
  //! `debug_synchronous` argument is ignored; the call forwards to the
  //! stream-only overload.
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_samples,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return HistogramRange(
      d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  //! Computes an intensity histogram using the specified bin boundary levels
  //! from a 2D region-of-interest: `num_rows` rows of `num_row_samples`
  //! samples each, where consecutive rows are `row_stride_bytes` bytes apart.
  //!
  //! @param d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //! @param temp_storage_bytes
  //!   Reference to the size in bytes of the `d_temp_storage` allocation
  //! @param d_samples
  //!   The pointer to the input sequence of data samples
  //! @param d_histogram
  //!   Pointer to the histogram counter output array of length
  //!   `num_levels - 1`
  //! @param num_levels
  //!   Number of boundaries (levels) delineating the histogram bins
  //! @param d_levels
  //!   The pointer to the array of boundaries (levels), one more than the
  //!   number of bins
  //! @param num_row_samples
  //!   The number of data samples per row in the region of interest
  //! @param num_rows
  //!   The number of rows in the region of interest
  //! @param row_stride_bytes
  //!   The number of bytes between starts of consecutive rows in the region
  //!   of interest
  //! @param stream
  //!   CUDA stream to launch kernels within. Default is stream-0.
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    return MultiHistogramRange<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      &d_histogram,
      &num_levels,
      &d_levels,
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  //! Deprecated overload retained for source compatibility. The
  //! `debug_synchronous` argument is ignored; the call forwards to the
  //! stream-only overload.
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return HistogramRange(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      d_levels,
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  //! Computes per-channel intensity histograms from a sequence of
  //! multi-channel "pixel" data samples using the specified bin boundary
  //! levels.
  //!
  //! - The input is a sequence of pixel structures, where each pixel
  //!   comprises a record of `NUM_CHANNELS` consecutive data samples.
  //! - Of the `NUM_CHANNELS` specified, the function will only compute
  //!   histograms for the first `NUM_ACTIVE_CHANNELS`.
  //! - For channel `i`, the range of bin `j` is
  //!   `[d_levels[i][j], d_levels[i][j + 1])`.
  //!
  //! @param d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //! @param temp_storage_bytes
  //!   Reference to the size in bytes of the `d_temp_storage` allocation
  //! @param d_samples
  //!   The pointer to the multi-channel input sequence of data samples
  //!   (interleaved channels)
  //! @param d_histogram
  //!   Array of `NUM_ACTIVE_CHANNELS` pointers; `d_histogram[i]` is the
  //!   counter output array for channel `i`, of length `num_levels[i] - 1`.
  //! @param num_levels
  //!   Number of bin boundaries per active channel
  //! @param d_levels
  //!   Per-channel pointers to arrays of bin boundaries (levels), one more
  //!   boundary than bins per channel
  //! @param num_pixels
  //!   The number of multi-channel pixels (i.e., the length of `d_samples`
  //!   divided by `NUM_CHANNELS`)
  //! @param stream
  //!   CUDA stream to launch kernels within. Default is stream-0.
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream = 0)
  {
    using SampleT = cub::detail::value_t<SampleIteratorT>;

    // Express as a single-row region of interest whose row stride covers the
    // full interleaved pixel sequence.
    return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      d_levels,
      num_pixels,
      (OffsetT) 1,
      (size_t) (sizeof(SampleT) * NUM_CHANNELS * num_pixels),
      stream);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  //! Deprecated overload retained for source compatibility. The
  //! `debug_synchronous` argument is ignored; the call forwards to the
  //! stream-only overload.
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // NOTE: the template arguments must be spelled out explicitly here:
    // NUM_CHANNELS / NUM_ACTIVE_CHANNELS appear only as array bounds in the
    // parameter list (a non-deduced context after array-to-pointer decay),
    // so template argument deduction cannot recover them from the call.
    return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_pixels, stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED

  //! Computes per-channel intensity histograms using the specified bin
  //! boundary levels from a 2D region-of-interest of multi-channel "pixel"
  //! data samples: `num_rows` rows of `num_row_pixels` pixels each, where
  //! consecutive rows are `row_stride_bytes` bytes apart.
  //!
  //! @param d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //! @param temp_storage_bytes
  //!   Reference to the size in bytes of the `d_temp_storage` allocation
  //! @param d_samples
  //!   The pointer to the multi-channel input sequence of data samples
  //!   (interleaved channels)
  //! @param d_histogram
  //!   Array of `NUM_ACTIVE_CHANNELS` pointers; `d_histogram[i]` is the
  //!   counter output array for channel `i`, of length `num_levels[i] - 1`.
  //! @param num_levels
  //!   Number of bin boundaries per active channel
  //! @param d_levels
  //!   Per-channel pointers to arrays of bin boundaries (levels), one more
  //!   boundary than bins per channel
  //! @param num_row_pixels
  //!   The number of multi-channel pixels per row in the region of interest
  //! @param num_rows
  //!   The number of rows in the region of interest
  //! @param row_stride_bytes
  //!   The number of bytes between starts of consecutive rows in the region
  //!   of interest
  //! @param stream
  //!   CUDA stream to launch kernels within. Default is stream-0.
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramRange");
    using SampleT = cub::detail::value_t<SampleIteratorT>;

    // Tag type telling the dispatch layer whether samples are 1-byte values.
    Int2Type<sizeof(SampleT) == 1> is_byte_sample;

    _CCCL_IF_CONSTEXPR (sizeof(OffsetT) > sizeof(int))
    {
      if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
      {
        // Down-convert OffsetT data type: the whole input extent fits in a
        // signed 32-bit integer, so dispatch with `int` offsets.
        return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::
          DispatchRange(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram,
            num_levels,
            d_levels,
            (int) num_row_pixels,
            (int) num_rows,
            (int) (row_stride_bytes / sizeof(SampleT)),
            stream,
            is_byte_sample);
      }
    }

    // Note: the dispatch layer takes the row stride in *samples*, not bytes.
    return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::
      DispatchRange(
        d_temp_storage,
        temp_storage_bytes,
        d_samples,
        d_histogram,
        num_levels,
        d_levels,
        num_row_pixels,
        num_rows,
        (OffsetT) (row_stride_bytes / sizeof(SampleT)),
        stream,
        is_byte_sample);
  }

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  //! Deprecated overload retained for source compatibility. The
  //! `debug_synchronous` argument is ignored; the call forwards to the
  //! stream-only overload.
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return MultiHistogramRange(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      d_levels,
      num_row_pixels,
      num_rows,
      row_stride_bytes,
      stream);
  }
#endif // _CCCL_DOXYGEN_INVOKED
};
CUB_NAMESPACE_END