/home/runner/work/cccl/cccl/cub/cub/block/block_radix_sort.cuh

File members: /home/runner/work/cccl/cccl/cub/cub/block/block_radix_sort.cuh

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/block/block_exchange.cuh>
#include <cub/block/block_radix_rank.cuh>
#include <cub/block/radix_rank_sort_operations.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN

template <typename KeyT,
          int BLOCK_DIM_X,
          int ITEMS_PER_THREAD,
          typename ValueT                         = NullType,
          int RADIX_BITS                          = 4,
          bool MEMOIZE_OUTER_SCAN                 = true,
          BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
          cudaSharedMemConfig SMEM_CONFIG         = cudaSharedMemBankSizeFourByte,
          int BLOCK_DIM_Y                         = 1,
          int BLOCK_DIM_Z                         = 1,
          int LEGACY_PTX_ARCH                     = 0>
class BlockRadixSort
{
private:
  /******************************************************************************
   * Constants and type definitions
   ******************************************************************************/

  enum
  {
    // The thread block size in threads
    BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

    // Whether or not there are values to be trucked along with keys
    KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value,
  };

  // KeyT traits and unsigned bits type
  using traits                 = detail::radix::traits_t<KeyT>;
  using bit_ordered_type       = typename traits::bit_ordered_type;
  using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy;

  using AscendingBlockRadixRank =
    BlockRadixRank<BLOCK_DIM_X,
                   RADIX_BITS,
                   false,
                   MEMOIZE_OUTER_SCAN,
                   INNER_SCAN_ALGORITHM,
                   SMEM_CONFIG,
                   BLOCK_DIM_Y,
                   BLOCK_DIM_Z>;

  using DescendingBlockRadixRank =
    BlockRadixRank<BLOCK_DIM_X,
                   RADIX_BITS,
                   true,
                   MEMOIZE_OUTER_SCAN,
                   INNER_SCAN_ALGORITHM,
                   SMEM_CONFIG,
                   BLOCK_DIM_Y,
                   BLOCK_DIM_Z>;

  using fundamental_digit_extractor_t = BFEDigitExtractor<KeyT>;

  using BlockExchangeKeys = BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;

  using BlockExchangeValues = BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
  union _TempStorage
  {
    typename AscendingBlockRadixRank::TempStorage asending_ranking_storage;
    typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
    typename BlockExchangeKeys::TempStorage exchange_keys;
    typename BlockExchangeValues::TempStorage exchange_values;
  };
#endif // DOXYGEN_SHOULD_SKIP_THIS

  /******************************************************************************
   * Thread fields
   ******************************************************************************/

  _TempStorage& temp_storage;

  unsigned int linear_tid;

  /******************************************************************************
   * Utility methods
   ******************************************************************************/

  _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
  {
    __shared__ _TempStorage private_storage;
    return private_storage;
  }

  template <class DigitExtractorT>
  _CCCL_DEVICE _CCCL_FORCEINLINE void
  RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD],
           int (&ranks)[ITEMS_PER_THREAD],
           DigitExtractorT digit_extractor,
           Int2Type<false> /*is_descending*/)
  {
    AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor);
  }

  template <class DigitExtractorT>
  _CCCL_DEVICE _CCCL_FORCEINLINE void
  RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD],
           int (&ranks)[ITEMS_PER_THREAD],
           DigitExtractorT digit_extractor,
           Int2Type<true> /*is_descending*/)
  {
    DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor);
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
    ValueT (&values)[ITEMS_PER_THREAD],
    int (&ranks)[ITEMS_PER_THREAD],
    Int2Type<false> /*is_keys_only*/,
    Int2Type<true> /*is_blocked*/)
  {
    CTA_SYNC();

    // Exchange values through shared memory in blocked arrangement
    BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
    ValueT (&values)[ITEMS_PER_THREAD],
    int (&ranks)[ITEMS_PER_THREAD],
    Int2Type<false> /*is_keys_only*/,
    Int2Type<false> /*is_blocked*/)
  {
    CTA_SYNC();

    // Exchange values through shared memory in blocked arrangement
    BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
  }

  template <int IS_BLOCKED>
  _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
    ValueT (& /*values*/)[ITEMS_PER_THREAD],
    int (& /*ranks*/)[ITEMS_PER_THREAD],
    Int2Type<true> /*is_keys_only*/,
    Int2Type<IS_BLOCKED> /*is_blocked*/)
  {}

  template <int DESCENDING, int KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
  _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlocked(
    KeyT (&keys)[ITEMS_PER_THREAD],
    ValueT (&values)[ITEMS_PER_THREAD],
    int begin_bit,
    int end_bit,
    Int2Type<DESCENDING> is_descending,
    Int2Type<KEYS_ONLY> is_keys_only,
    DecomposerT decomposer = {})
  {
    bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast<bit_ordered_type(&)[ITEMS_PER_THREAD]>(keys);

#pragma unroll
    for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
    {
      unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
    }

    // Radix sorting passes
    while (true)
    {
      int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
      auto digit_extractor =
        traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);

      // Rank the blocked keys
      int ranks[ITEMS_PER_THREAD];
      RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
      begin_bit += RADIX_BITS;

      CTA_SYNC();

      // Exchange keys through shared memory in blocked arrangement
      BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);

      // Exchange values through shared memory in blocked arrangement
      ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());

      // Quit if done
      if (begin_bit >= end_bit)
      {
        break;
      }

      CTA_SYNC();
    }

// Untwiddle bits if necessary
#pragma unroll
    for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
    {
      unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
    }
  }

public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

  template <int DESCENDING, int KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
  _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
    KeyT (&keys)[ITEMS_PER_THREAD],
    ValueT (&values)[ITEMS_PER_THREAD],
    int begin_bit,
    int end_bit,
    Int2Type<DESCENDING> is_descending,
    Int2Type<KEYS_ONLY> is_keys_only,
    DecomposerT decomposer = {})
  {
    bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast<bit_ordered_type(&)[ITEMS_PER_THREAD]>(keys);

#  pragma unroll
    for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
    {
      unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
    }

    // Radix sorting passes
    while (true)
    {
      int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
      auto digit_extractor =
        traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);

      // Rank the blocked keys
      int ranks[ITEMS_PER_THREAD];
      RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
      begin_bit += RADIX_BITS;

      CTA_SYNC();

      // Check if this is the last pass
      if (begin_bit >= end_bit)
      {
        // Last pass exchanges keys through shared memory in striped arrangement
        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);

        // Last pass exchanges through shared memory in striped arrangement
        ExchangeValues(values, ranks, is_keys_only, Int2Type<false>());

        // Quit
        break;
      }

      // Exchange keys through shared memory in blocked arrangement
      BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);

      // Exchange values through shared memory in blocked arrangement
      ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());

      CTA_SYNC();
    }

// Untwiddle bits if necessary
#  pragma unroll
    for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
    {
      unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
    }
  }

#endif // DOXYGEN_SHOULD_SKIP_THIS

  struct TempStorage : Uninitialized<_TempStorage>
  {};

  _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort()
      : temp_storage(PrivateStorage())
      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
  {}

  _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort(TempStorage& temp_storage)
      : temp_storage(temp_storage.Alias())
      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
  {}

  _CCCL_DEVICE _CCCL_FORCEINLINE void
  Sort(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>(), decomposer);
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
  {
    Sort(keys, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer));
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void
  Sort(KeyT (&keys)[ITEMS_PER_THREAD],
       ValueT (&values)[ITEMS_PER_THREAD],
       int begin_bit = 0,
       int end_bit   = sizeof(KeyT) * 8)
  {
    SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    Sort(KeyT (&keys)[ITEMS_PER_THREAD],
         ValueT (&values)[ITEMS_PER_THREAD],
         DecomposerT decomposer,
         int begin_bit,
         int end_bit)
  {
    SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>(), decomposer);
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
  {
    Sort(keys, values, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer));
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void
  SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>(), decomposer);
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlocked(
      keys,
      values,
      0,
      detail::radix::traits_t<KeyT>::default_end_bit(decomposer),
      Int2Type<true>(),
      Int2Type<KEYS_ONLY>(),
      decomposer);
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescending(
    KeyT (&keys)[ITEMS_PER_THREAD],
    ValueT (&values)[ITEMS_PER_THREAD],
    int begin_bit = 0,
    int end_bit   = sizeof(KeyT) * 8)
  {
    SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortDescending(KeyT (&keys)[ITEMS_PER_THREAD],
                   ValueT (&values)[ITEMS_PER_THREAD],
                   DecomposerT decomposer,
                   int begin_bit,
                   int end_bit)
  {
    SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>(), decomposer);
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
  {
    SortBlocked(
      keys,
      values,
      0,
      detail::radix::traits_t<KeyT>::default_end_bit(decomposer),
      Int2Type<true>(),
      Int2Type<KEYS_ONLY>(),
      decomposer);
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void
  SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>(), decomposer);
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlockedToStriped(
      keys,
      values,
      0,
      detail::radix::traits_t<KeyT>::default_end_bit(decomposer),
      Int2Type<false>(),
      Int2Type<KEYS_ONLY>(),
      decomposer);
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
    KeyT (&keys)[ITEMS_PER_THREAD],
    ValueT (&values)[ITEMS_PER_THREAD],
    int begin_bit = 0,
    int end_bit   = sizeof(KeyT) * 8)
  {
    SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
                         ValueT (&values)[ITEMS_PER_THREAD],
                         DecomposerT decomposer,
                         int begin_bit,
                         int end_bit)
  {
    SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>(), decomposer);
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
  {
    SortBlockedToStriped(
      keys,
      values,
      0,
      detail::radix::traits_t<KeyT>::default_end_bit(decomposer),
      Int2Type<false>(),
      Int2Type<KEYS_ONLY>(),
      decomposer);
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void
  SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>(), decomposer);
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
  {
    NullType values[ITEMS_PER_THREAD];

    SortBlockedToStriped(
      keys,
      values,
      0,
      detail::radix::traits_t<KeyT>::default_end_bit(decomposer),
      Int2Type<true>(),
      Int2Type<KEYS_ONLY>(),
      decomposer);
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescendingBlockedToStriped(
    KeyT (&keys)[ITEMS_PER_THREAD],
    ValueT (&values)[ITEMS_PER_THREAD],
    int begin_bit = 0,
    int end_bit   = sizeof(KeyT) * 8)
  {
    SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortDescendingBlockedToStriped(
      KeyT (&keys)[ITEMS_PER_THREAD],
      ValueT (&values)[ITEMS_PER_THREAD],
      DecomposerT decomposer,
      int begin_bit,
      int end_bit)
  {
    SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>(), decomposer);
  }

  template <class DecomposerT>
  _CCCL_DEVICE _CCCL_FORCEINLINE //
    typename ::cuda::std::enable_if< //
      !::cuda::std::is_convertible<DecomposerT, int>::value>::type
    SortDescendingBlockedToStriped(
      KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
  {
    SortBlockedToStriped(
      keys,
      values,
      0,
      detail::radix::traits_t<KeyT>::default_end_bit(decomposer),
      Int2Type<true>(),
      Int2Type<KEYS_ONLY>(),
      decomposer);
  }

};

CUB_NAMESPACE_END