// grid_even_share.cuh
//
// File members: cub/grid/grid_even_share.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/grid/grid_mapping.cuh>
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/__algorithm_>

CUB_NAMESPACE_BEGIN

//! @brief Descriptor that splits `num_items` of input, measured in fixed-size tiles, across the thread
//!        blocks of a grid.
//!
//! Usage is two-phase: `DispatchInit()` computes the grid-wide partition parameters (it is callable
//! from host or device), then each thread block calls one of the `BlockInit()` overloads to obtain
//! its own `[block_offset, block_end)` range and the `block_stride` to advance by.
//!
//! With `GRID_MAPPING_RAKE` every block receives one contiguous range. The first `big_shares`
//! blocks receive one more tile than the remaining ("normal") blocks, so the leftover tiles are
//! spread one per block. With `GRID_MAPPING_STRIP_MINE` block `i` starts at tile `i` and advances
//! by `grid_size` tiles at a time up to `num_items`.
//!
//! @tparam OffsetT
//!   Integer type used for item offsets and counts
template <typename OffsetT>
struct GridEvenShare
{
private:
  //! Number of tiles needed to cover the input; never less than 1
  int total_tiles;

  //! Number of thread blocks that are assigned one extra tile
  int big_shares;

  //! Items assigned to each "big" block (`normal_share_items` plus one tile)
  OffsetT big_share_items;

  //! Items assigned to each "normal" block
  OffsetT normal_share_items;

  //! Constant added to `block_id * normal_share_items` to get a normal block's starting offset;
  //! equals `big_shares` tiles' worth of items
  OffsetT normal_base_offset;

public:
  //! Total number of input items
  OffsetT num_items;

  //! Number of thread blocks the input is partitioned across
  int grid_size;

  //! Offset of the first item assigned to the calling thread block
  OffsetT block_offset;

  //! Past-the-end offset of the calling thread block's range
  OffsetT block_end;

  //! Distance, in items, between successive tiles consumed by the calling thread block
  OffsetT block_stride;

  //! @brief Constructs a descriptor with every field zeroed.
  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE GridEvenShare()
      : total_tiles(0)
      , big_shares(0)
      , big_share_items(0)
      , normal_share_items(0)
      , normal_base_offset(0)
      , num_items(0)
      , grid_size(0)
      , block_offset(0)
      , block_end(0)
      , block_stride(0)
  {}

  //! @brief Computes the grid-wide partition. Call once before any `BlockInit()` overload that
  //!        takes a mapping strategy.
  //!
  //! After this call `block_offset` and `block_end` both equal `num_items_` (an empty,
  //! past-the-end range) and `grid_size` holds the number of thread blocks to use.
  //!
  //! @param[in] num_items_
  //!   Total number of input items
  //!
  //! @param[in] max_grid_size
  //!   Upper bound on the number of thread blocks. Values below 1 are treated as 1 so that
  //!   `grid_size` can always be used as a divisor.
  //!
  //! @param[in] tile_items
  //!   Number of items per tile. Must be positive, since it is used as a divisor.
  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void DispatchInit(OffsetT num_items_, int max_grid_size, int tile_items)
  {
    this->block_offset = num_items_; // Initialize past-the-end
    this->block_end    = num_items_; // Initialize past-the-end
    this->num_items    = num_items_;

    // At least one tile even for empty input, so that the grid is never empty
    this->total_tiles = _CUDA_VSTD::max(1, static_cast<int>(::cuda::ceil_div(num_items_, tile_items)));

    // Never more blocks than tiles, never more than the caller allows, and never fewer than one:
    // a non-positive max_grid_size would otherwise make the division below divide by zero (or
    // produce negative shares).
    this->grid_size = _CUDA_VSTD::max(1, _CUDA_VSTD::min(total_tiles, max_grid_size));

    const int avg_tiles_per_block = total_tiles / grid_size;

    // Leftover tiles (total_tiles % grid_size) go one each to the first `big_shares` blocks
    this->big_shares = total_tiles - (avg_tiles_per_block * grid_size);

    // Widen to OffsetT before multiplying so the item counts are not computed in `int`
    this->normal_share_items = static_cast<OffsetT>(avg_tiles_per_block) * tile_items;
    this->normal_base_offset = static_cast<OffsetT>(big_shares) * tile_items;
    this->big_share_items    = normal_share_items + tile_items;
  }

  //! @brief Assigns thread block `block_id` one contiguous range of the input
  //!        (`GRID_MAPPING_RAKE`).
  //!
  //! Blocks `[0, big_shares)` get `big_share_items` items each; blocks `[big_shares, grid_size)`
  //! get `normal_share_items` items each, clipped to `num_items`. Any other block id gets the
  //! empty range `[num_items, num_items)`.
  //!
  //! @tparam TILE_ITEMS
  //!   Number of items per tile; becomes `block_stride`. Presumably the same value that was passed
  //!   to `DispatchInit()` as `tile_items` - confirm at call sites.
  //!
  //! @param[in] block_id
  //!   Index of the thread block within the grid
  template <int TILE_ITEMS>
  _CCCL_DEVICE _CCCL_FORCEINLINE void BlockInit(int block_id, detail::constant_t<GRID_MAPPING_RAKE> /*strategy_tag*/)
  {
    block_stride = TILE_ITEMS;
    if (block_id < big_shares)
    {
      // This thread block gets a big share of tiles (avg_tiles_per_block + 1)
      block_offset = (block_id * big_share_items);
      block_end    = block_offset + big_share_items;
    }
    else if (block_id < grid_size)
    {
      // This thread block gets a normal share of tiles (avg_tiles_per_block).
      // Blocks [0, grid_size) cover all total_tiles tiles, so bounding by grid_size (rather than
      // total_tiles) guarantees block_offset <= num_items and the subtraction below cannot wrap.
      block_offset = normal_base_offset + (block_id * normal_share_items);
      // Avoid generating values greater than num_items, as it may cause overflow
      block_end = block_offset + _CUDA_VSTD::min(num_items - block_offset, normal_share_items);
    }
    else
    {
      // No work for this block: set an explicit empty, past-the-end range instead of relying on
      // whatever the fields held before this call
      block_offset = num_items;
      block_end    = num_items;
    }
  }

  //! @brief Assigns thread block `block_id` an interleaved sequence of tiles
  //!        (`GRID_MAPPING_STRIP_MINE`).
  //!
  //! The block starts at item `block_id * TILE_ITEMS`, advances by `grid_size * TILE_ITEMS` items
  //! per step, and its range ends at `num_items`.
  //!
  //! @tparam TILE_ITEMS
  //!   Number of items per tile
  //!
  //! @param[in] block_id
  //!   Index of the thread block within the grid
  template <int TILE_ITEMS>
  _CCCL_DEVICE _CCCL_FORCEINLINE void
  BlockInit(int block_id, detail::constant_t<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
  {
    // Widen to OffsetT before multiplying: the products would otherwise be evaluated in `int`
    // and could overflow before being stored into the (possibly wider) OffsetT fields
    block_stride = static_cast<OffsetT>(grid_size) * TILE_ITEMS;
    block_offset = static_cast<OffsetT>(block_id) * TILE_ITEMS;
    block_end    = num_items;
  }

  //! @brief Initializes the calling thread block's range using `blockIdx.x` as the block id and
  //!        the mapping selected by `STRATEGY`.
  //!
  //! @tparam TILE_ITEMS
  //!   Number of items per tile
  //!
  //! @tparam STRATEGY
  //!   Mapping strategy; selects which tagged `BlockInit()` overload is invoked
  template <int TILE_ITEMS, GridMappingStrategy STRATEGY>
  _CCCL_DEVICE _CCCL_FORCEINLINE void BlockInit()
  {
    BlockInit<TILE_ITEMS>(blockIdx.x, detail::constant_v<STRATEGY>);
  }

  //! @brief Initializes the calling thread block's range from explicit bounds, bypassing the
  //!        even-share partition.
  //!
  //! @tparam TILE_ITEMS
  //!   Number of items per tile; becomes `block_stride`
  //!
  //! @param[in] block_offset
  //!   Offset of the first item of the range (stored into the member of the same name)
  //!
  //! @param[in] block_end
  //!   Past-the-end offset of the range (stored into the member of the same name)
  template <int TILE_ITEMS, typename OffsetT1 = OffsetT>
  _CCCL_DEVICE _CCCL_FORCEINLINE void BlockInit(OffsetT1 block_offset, OffsetT1 block_end)
  {
    // Parameters shadow the members, hence the explicit `this->`
    this->block_offset = block_offset;
    this->block_end    = block_end;
    this->block_stride = TILE_ITEMS;
  }
};

CUB_NAMESPACE_END