cub/block/block_run_length_decode.cuh

File members: cub/block/block_run_length_decode.cuh

/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/block/block_scan.cuh>
#include <cub/thread/thread_search.cuh>
#include <cub/util_math.cuh>
#include <cub/util_namespace.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>

#include <limits>
#include <type_traits>

CUB_NAMESPACE_BEGIN

template <typename ItemT,
          int BLOCK_DIM_X,
          int RUNS_PER_THREAD,
          int DECODED_ITEMS_PER_THREAD,
          typename DecodedOffsetT = uint32_t,
          int BLOCK_DIM_Y         = 1,
          int BLOCK_DIM_Z         = 1>
class BlockRunLengthDecode
{
  //---------------------------------------------------------------------
  // CONFIGS & TYPE ALIASES
  //---------------------------------------------------------------------

private:
  static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;

  static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;

  using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_RAKING_MEMOIZE, BLOCK_DIM_Y, BLOCK_DIM_Z>;

  using RunOffsetT = uint32_t;

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  union _TempStorage
  {
    typename RunOffsetScanT::TempStorage offset_scan;
    struct
    {
      ItemT run_values[BLOCK_RUNS];
      DecodedOffsetT run_offsets[BLOCK_RUNS];
    } runs;
  }; // union TempStorage
#endif // _CCCL_DOXYGEN_INVOKED

  _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
  {
    __shared__ _TempStorage private_storage;
    return private_storage;
  }

  _TempStorage& temp_storage;

  uint32_t linear_tid;

public:
  struct TempStorage : Uninitialized<_TempStorage>
  {};

  //---------------------------------------------------------------------
  // CONSTRUCTOR
  //---------------------------------------------------------------------

  template <typename RunLengthT, typename TotalDecodedSizeT>
  _CCCL_DEVICE _CCCL_FORCEINLINE BlockRunLengthDecode(
    TempStorage& temp_storage,
    ItemT (&run_values)[RUNS_PER_THREAD],
    RunLengthT (&run_lengths)[RUNS_PER_THREAD],
    TotalDecodedSizeT& total_decoded_size)
      : temp_storage(temp_storage.Alias())
      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
  {
    InitWithRunLengths(run_values, run_lengths, total_decoded_size);
  }

  template <typename UserRunOffsetT>
  _CCCL_DEVICE _CCCL_FORCEINLINE BlockRunLengthDecode(
    TempStorage& temp_storage, ItemT (&run_values)[RUNS_PER_THREAD], UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
      : temp_storage(temp_storage.Alias())
      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
  {
    InitWithRunOffsets(run_values, run_offsets);
  }

  template <typename RunLengthT, typename TotalDecodedSizeT>
  _CCCL_DEVICE _CCCL_FORCEINLINE BlockRunLengthDecode(
    ItemT (&run_values)[RUNS_PER_THREAD],
    RunLengthT (&run_lengths)[RUNS_PER_THREAD],
    TotalDecodedSizeT& total_decoded_size)
      : temp_storage(PrivateStorage())
      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
  {
    InitWithRunLengths(run_values, run_lengths, total_decoded_size);
  }

  template <typename UserRunOffsetT>
  _CCCL_DEVICE _CCCL_FORCEINLINE
  BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD], UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
      : temp_storage(PrivateStorage())
      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
  {
    InitWithRunOffsets(run_values, run_offsets);
  }

private:
  template <int MAX_NUM_ITEMS, typename InputIteratorT, typename OffsetT, typename T>
  _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT StaticUpperBound(InputIteratorT input, OffsetT num_items, T val)
  {
    OffsetT lower_bound = 0;
    OffsetT upper_bound = num_items;
#pragma unroll
    for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
    {
      OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
      mid         = (::cuda::std::min)(mid, num_items - 1);

      if (val < input[mid])
      {
        upper_bound = mid;
      }
      else
      {
        lower_bound = mid + 1;
      }
    }

    return lower_bound;
  }

  template <typename RunOffsetT>
  _CCCL_DEVICE _CCCL_FORCEINLINE void
  InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD], RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
  {
    // Keep the runs' items and the offsets of each run's beginning in the temporary storage
    RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
#pragma unroll
    for (int i = 0; i < RUNS_PER_THREAD; i++)
    {
      temp_storage.runs.run_values[thread_dst_offset]  = run_values[i];
      temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
      thread_dst_offset++;
    }

    // Ensure run offsets and run values have been written to shared memory
    __syncthreads();
  }

  template <typename RunLengthT, typename TotalDecodedSizeT>
  _CCCL_DEVICE _CCCL_FORCEINLINE void InitWithRunLengths(
    ItemT (&run_values)[RUNS_PER_THREAD],
    RunLengthT (&run_lengths)[RUNS_PER_THREAD],
    TotalDecodedSizeT& total_decoded_size)
  {
    // Compute the offset for the beginning of each run
    DecodedOffsetT run_offsets[RUNS_PER_THREAD];
#pragma unroll
    for (int i = 0; i < RUNS_PER_THREAD; i++)
    {
      run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
    }
    DecodedOffsetT decoded_size_aggregate;
    RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
    total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);

    // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
    __syncthreads();

    InitWithRunOffsets(run_values, run_offsets);
  }

public:
  template <typename RelativeOffsetT>
  _CCCL_DEVICE _CCCL_FORCEINLINE void RunLengthDecode(
    ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
    RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
    DecodedOffsetT from_decoded_offset = 0)
  {
    // The (global) offset of the first item decoded by this thread
    DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;

    // The run that the first decoded item of this thread belongs to
    // If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the
    // last run
    RunOffsetT assigned_run =
      StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset)
      - static_cast<RunOffsetT>(1U);

    DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];

    // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
    DecodedOffsetT assigned_run_end =
      (assigned_run == BLOCK_RUNS - 1)
        ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
        : temp_storage.runs.run_offsets[assigned_run + 1];

    ItemT val = temp_storage.runs.run_values[assigned_run];

#pragma unroll
    for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
    {
      decoded_items[i] = val;
      item_offsets[i]  = thread_decoded_offset - assigned_run_begin;

      // A thread only needs to fetch the next run if this was not the last loop iteration
      const bool is_final_loop_iteration = (i + 1 >= DECODED_ITEMS_PER_THREAD);
      if (!is_final_loop_iteration && (thread_decoded_offset == assigned_run_end - 1))
      {
        // We make sure that a thread is not re-entering this conditional when being assigned to the last run already by
        // extending the last run's length to all the thread's item
        assigned_run++;
        assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];

        // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
        assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
                           ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
                           : temp_storage.runs.run_offsets[assigned_run + 1];
        val              = temp_storage.runs.run_values[assigned_run];
      }
      thread_decoded_offset++;
    }
  }

  _CCCL_DEVICE _CCCL_FORCEINLINE void
  RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], DecodedOffsetT from_decoded_offset = 0)
  {
    DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
    RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
  }
};

CUB_NAMESPACE_END