cub/warp/warp_utils.cuh

File members: cub/warp/warp_utils.cuh

// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cuda/cmath>
#include <cuda/ptx>
#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN
namespace detail
{

template <int LogicalWarpSize>
inline constexpr bool is_valid_logical_warp_size_v = LogicalWarpSize >= 1 && LogicalWarpSize <= detail::warp_threads;

template <int LogicalWarpSize>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE int
logical_lane_id(_CUDA_VSTD::integral_constant<int, LogicalWarpSize> = {})
{
  static_assert(is_valid_logical_warp_size_v<LogicalWarpSize>, "invalid logical warp size");
  auto lane                             = _CUDA_VPTX::get_sreg_laneid();
  constexpr bool is_full_warp           = LogicalWarpSize == detail::warp_threads;
  constexpr auto is_single_logical_warp = is_full_warp || !::cuda::is_power_of_two(LogicalWarpSize);
  auto logical_lane =
    static_cast<int>(is_single_logical_warp ? lane : (LogicalWarpSize == 1 ? 0 : lane % LogicalWarpSize));
  _CCCL_ASSUME(logical_lane >= 0 && logical_lane < LogicalWarpSize);
  return logical_lane;
}

template <int LogicalWarpSize>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE int
logical_warp_id(_CUDA_VSTD::integral_constant<int, LogicalWarpSize> = {})
{
  static_assert(is_valid_logical_warp_size_v<LogicalWarpSize>, "invalid logical warp size");
  auto lane                             = _CUDA_VPTX::get_sreg_laneid();
  constexpr bool is_full_warp           = LogicalWarpSize == detail::warp_threads;
  constexpr auto is_single_logical_warp = is_full_warp || !::cuda::is_power_of_two(LogicalWarpSize);
  auto logical_warp_id                  = static_cast<int>(is_single_logical_warp ? 0 : lane / LogicalWarpSize);
  _CCCL_ASSUME(logical_warp_id >= 0 && logical_warp_id < detail::warp_threads / LogicalWarpSize);
  return logical_warp_id;
}

template <int LogicalWarpSize>
[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE int
logical_warp_base_id(_CUDA_VSTD::integral_constant<int, LogicalWarpSize> logical_warp_size = {})
{
  return cub::detail::logical_warp_id(logical_warp_size) * LogicalWarpSize;
}

} // namespace detail
CUB_NAMESPACE_END