/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/detail/nvtx.cuh>
#include <cub/device/dispatch/dispatch_histogram.cuh>
#include <cub/util_deprecated.cuh>

#include <iterator>
#include <limits>

#include <climits> // INT_MAX, used when down-converting OffsetT below
#include <cstdio>

CUB_NAMESPACE_BEGIN

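//! DeviceHistogram provides device-wide, parallel operations for constructing
//! histogram(s) over sequences of samples residing in device-accessible memory.
//!
//! Usage follows the usual CUB two-phase convention: when `d_temp_storage` is
//! nullptr, no work is performed and the required temporary storage size is
//! written to `temp_storage_bytes`; the caller allocates that storage and then
//! invokes the same method again to run the histogram.
//!
//! - The `*Even` methods bin samples into `num_levels - 1` equal-width bins
//!   spanning `[lower_level, upper_level)`.
//! - The `*Range` methods bin samples using `num_levels` caller-provided bin
//!   boundaries (`num_levels - 1` bins, where bin `i` covers
//!   `[d_levels[i], d_levels[i + 1])`).
//! - The `Multi*` methods operate on interleaved multi-channel data (e.g.,
//!   RGBA pixels), producing one histogram per active channel.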
struct DeviceHistogram
{

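  //! Computes an intensity histogram from a sequence of `num_samples` samples,
  //! using `num_levels - 1` equal-width bins spanning
  //! `[lower_level, upper_level)`. Implemented as a single-channel
  //! specialization of MultiHistogramEven.
  //!
  //! Illustrative usage sketch (not part of the original file; the device
  //! pointers are placeholders assumed to reference valid device-accessible
  //! allocations):
  //!
  //! @code{.cpp}
  //! float* d_samples;    // e.g., 10 samples in [0.0, 12.0)
  //! int*   d_histogram;  // 6 bin counters
  //! int    num_levels  = 7; // 7 boundaries -> 6 bins
  //! float  lower_level = 0.0f;
  //! float  upper_level = 12.0f;
  //! int    num_samples = 10;
  //!
  //! // Phase 1: query temporary storage requirements
  //! void*  d_temp_storage     = nullptr;
  //! size_t temp_storage_bytes = 0;
  //! cub::DeviceHistogram::HistogramEven(
  //!   d_temp_storage, temp_storage_bytes,
  //!   d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
  //!
  //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //! // Phase 2: compute the histogram
  //! cub::DeviceHistogram::HistogramEven(
  //!   d_temp_storage, temp_storage_bytes,
  //!   d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
  //! @endcode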
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_samples,
    cudaStream_t stream = 0)
  {
    using SampleT = cub::detail::value_t<SampleIteratorT>;
    return MultiHistogramEven<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      &d_histogram,
      &num_levels,
      &lower_level,
      &upper_level,
      num_samples,
      static_cast<OffsetT>(1),
      sizeof(SampleT) * num_samples,
      stream);
  }

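  //! Overload retained for backward compatibility: `debug_synchronous` is
  //! ignored; forwards to the stream-only overload above.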
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_samples,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return HistogramEven(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_samples,
      stream);
  }

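  //! Computes an intensity histogram over a 2D region of interest: `num_rows`
  //! rows of `num_row_samples` samples each, with the starts of consecutive
  //! rows separated by `row_stride_bytes` bytes. Samples are binned into
  //! `num_levels - 1` equal-width bins spanning `[lower_level, upper_level)`.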
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    return MultiHistogramEven<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      &d_histogram,
      &num_levels,
      &lower_level,
      &upper_level,
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }

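  //! Overload retained for backward compatibility: `debug_synchronous` is
  //! ignored; forwards to the stream-only overload above.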
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return HistogramEven(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }

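  //! Computes per-channel intensity histograms from a sequence of `num_pixels`
  //! interleaved pixels. Each pixel carries `NUM_CHANNELS` samples, and
  //! histograms are produced for the first `NUM_ACTIVE_CHANNELS` channels
  //! (e.g., NUM_CHANNELS = 4, NUM_ACTIVE_CHANNELS = 3 to histogram the RGB
  //! channels of RGBA data). Channel `c` is binned into `num_levels[c] - 1`
  //! equal-width bins spanning `[lower_level[c], upper_level[c])`.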
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream = 0)
  {
    using SampleT = cub::detail::value_t<SampleIteratorT>;

    return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_pixels,
      static_cast<OffsetT>(1),
      sizeof(SampleT) * NUM_CHANNELS * num_pixels,
      stream);
  }

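  //! Overload retained for backward compatibility: `debug_synchronous` is
  //! ignored; forwards to the stream-only overload above.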
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_pixels,
      stream);
  }

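  //! Computes per-channel intensity histograms over a 2D region of interest of
  //! interleaved pixel data: `num_rows` rows of `num_row_pixels` pixels each,
  //! with the starts of consecutive rows separated by `row_stride_bytes` bytes.
  //! When `OffsetT` is wider than `int` and the region is small enough,
  //! offsets are down-converted to `int` before dispatch.
  //!
  //! Illustrative usage sketch (not part of the original file; the device
  //! pointers are placeholders assumed to reference valid device-accessible
  //! allocations):
  //!
  //! @code{.cpp}
  //! // Three 256-bin histograms (R, G, B) over an RGBA8 image
  //! unsigned char* d_rgba;          // width * height interleaved RGBA pixels
  //! int*           d_histogram[3];  // three arrays of 256 counters each
  //! int            num_levels[3]  = {257, 257, 257};
  //! unsigned int   lower_level[3] = {0, 0, 0};
  //! unsigned int   upper_level[3] = {256, 256, 256};
  //! int            width = 1920, height = 1080;
  //! size_t         row_stride_bytes = size_t(width) * 4 * sizeof(unsigned char);
  //!
  //! // Phase 1: query temporary storage requirements
  //! void*  d_temp_storage     = nullptr;
  //! size_t temp_storage_bytes = 0;
  //! cub::DeviceHistogram::MultiHistogramEven<4, 3>(
  //!   d_temp_storage, temp_storage_bytes, d_rgba, d_histogram,
  //!   num_levels, lower_level, upper_level, width, height, row_stride_bytes);
  //!
  //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //! // Phase 2: compute the histograms
  //! cub::DeviceHistogram::MultiHistogramEven<4, 3>(
  //!   d_temp_storage, temp_storage_bytes, d_rgba, d_histogram,
  //!   num_levels, lower_level, upper_level, width, height, row_stride_bytes);
  //! @endcode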
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramEven");

    using SampleT = cub::detail::value_t<SampleIteratorT>;
    Int2Type<sizeof(SampleT) == 1> is_byte_sample;

    _CCCL_IF_CONSTEXPR (sizeof(OffsetT) > sizeof(int))
    {
      if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
      {
        // Down-convert OffsetT data type
        return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
          d_temp_storage,
          temp_storage_bytes,
          d_samples,
          d_histogram,
          num_levels,
          lower_level,
          upper_level,
          (int) num_row_pixels,
          (int) num_rows,
          (int) (row_stride_bytes / sizeof(SampleT)),
          stream,
          is_byte_sample);
      }
    }

    return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_row_pixels,
      num_rows,
      (OffsetT) (row_stride_bytes / sizeof(SampleT)),
      stream,
      is_byte_sample);
  }

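  //! Overload retained for backward compatibility: `debug_synchronous` is
  //! ignored; forwards to the stream-only overload above.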
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_row_pixels,
      num_rows,
      row_stride_bytes,
      stream);
  }

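  //! Computes an intensity histogram from a sequence of `num_samples` samples,
  //! using caller-defined bin boundaries: `d_levels` supplies `num_levels`
  //! boundaries in increasing order, defining `num_levels - 1` bins where bin
  //! `i` covers `[d_levels[i], d_levels[i + 1])`. Implemented as a
  //! single-channel specialization of MultiHistogramRange.
  //!
  //! Illustrative usage sketch (not part of the original file; the device
  //! pointers are placeholders assumed to reference valid device-accessible
  //! allocations):
  //!
  //! @code{.cpp}
  //! float* d_samples;    // e.g., 10 samples
  //! int*   d_histogram;  // 6 bin counters
  //! float* d_levels;     // 7 boundaries, e.g. {0, 2, 4, 6, 8, 10, 12}
  //! int    num_levels  = 7;
  //! int    num_samples = 10;
  //!
  //! // Phase 1: query temporary storage requirements
  //! void*  d_temp_storage     = nullptr;
  //! size_t temp_storage_bytes = 0;
  //! cub::DeviceHistogram::HistogramRange(
  //!   d_temp_storage, temp_storage_bytes,
  //!   d_samples, d_histogram, num_levels, d_levels, num_samples);
  //!
  //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //! // Phase 2: compute the histogram
  //! cub::DeviceHistogram::HistogramRange(
  //!   d_temp_storage, temp_storage_bytes,
  //!   d_samples, d_histogram, num_levels, d_levels, num_samples);
  //! @endcode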
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_samples,
    cudaStream_t stream = 0)
  {
    using SampleT = cub::detail::value_t<SampleIteratorT>;
    return MultiHistogramRange<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      &d_histogram,
      &num_levels,
      &d_levels,
      num_samples,
      (OffsetT) 1,
      (size_t) (sizeof(SampleT) * num_samples),
      stream);
  }

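  //! Overload retained for backward compatibility: `debug_synchronous` is
  //! ignored; forwards to the stream-only overload above.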
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_samples,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return HistogramRange(
      d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples, stream);
  }

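  //! Computes an intensity histogram over a 2D region of interest (`num_rows`
  //! rows of `num_row_samples` samples, rows separated by `row_stride_bytes`
  //! bytes), binning samples with the `num_levels` caller-defined boundaries
  //! in `d_levels`.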
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    return MultiHistogramRange<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      &d_histogram,
      &num_levels,
      &d_levels,
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }

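  //! Overload retained for backward compatibility: `debug_synchronous` is
  //! ignored; forwards to the stream-only overload above.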
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return HistogramRange(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      d_levels,
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }

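  //! Computes per-channel intensity histograms from a sequence of `num_pixels`
  //! interleaved pixels, histogramming the first `NUM_ACTIVE_CHANNELS` of
  //! `NUM_CHANNELS` channels. Channel `c` is binned with the `num_levels[c]`
  //! caller-defined boundaries in `d_levels[c]`.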
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream = 0)
  {
    using SampleT = cub::detail::value_t<SampleIteratorT>;

    return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      d_levels,
      num_pixels,
      (OffsetT) 1,
      (size_t) (sizeof(SampleT) * NUM_CHANNELS * num_pixels),
      stream);
  }

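  //! Overload retained for backward compatibility: `debug_synchronous` is
  //! ignored; forwards to the stream-only overload above.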
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_pixels, stream);
  }

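  //! Computes per-channel intensity histograms over a 2D region of interest of
  //! interleaved pixel data, binning each channel with its caller-defined
  //! boundaries in `d_levels`. Mirrors the row-based MultiHistogramEven
  //! overload above; offsets are likewise down-converted to `int` when the
  //! region is small enough.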
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramRange");

    using SampleT = cub::detail::value_t<SampleIteratorT>;
    Int2Type<sizeof(SampleT) == 1> is_byte_sample;

    _CCCL_IF_CONSTEXPR (sizeof(OffsetT) > sizeof(int))
    {
      if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
      {
        // Down-convert OffsetT data type
        return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
          d_temp_storage,
          temp_storage_bytes,
          d_samples,
          d_histogram,
          num_levels,
          d_levels,
          (int) num_row_pixels,
          (int) num_rows,
          (int) (row_stride_bytes / sizeof(SampleT)),
          stream,
          is_byte_sample);
      }
    }

    return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      d_levels,
      num_row_pixels,
      num_rows,
      (OffsetT) (row_stride_bytes / sizeof(SampleT)),
      stream,
      is_byte_sample);
  }

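  //! Overload retained for backward compatibility: `debug_synchronous` is
  //! ignored; forwards to the stream-only overload above.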
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream,
    bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      d_levels,
      num_row_pixels,
      num_rows,
      row_stride_bytes,
      stream);
  }

};

CUB_NAMESPACE_END