cuda::device::lane_mask
Defined in <cuda/warp>
header.
#include <cuda/std/cstdint>
namespace cuda::device
{
class lane_mask
{
// constructors
__device__ explicit constexpr lane_mask(cuda::std::uint32_t v = 0) noexcept;
// member functions
[[nodiscard]] __device__ constexpr cuda::std::uint32_t value() const noexcept;
// conversion operators
__device__ explicit constexpr operator cuda::std::uint32_t() const noexcept;
// static member functions
[[nodiscard]] __device__ static constexpr lane_mask none() noexcept;
[[nodiscard]] __device__ static constexpr lane_mask all() noexcept;
[[nodiscard]] __device__ static lane_mask all_active() noexcept;
[[nodiscard]] __device__ static lane_mask this_lane() noexcept;
[[nodiscard]] __device__ static lane_mask all_less() noexcept;
[[nodiscard]] __device__ static lane_mask all_less_equal() noexcept;
[[nodiscard]] __device__ static lane_mask all_greater() noexcept;
[[nodiscard]] __device__ static lane_mask all_greater_equal() noexcept;
[[nodiscard]] __device__ static lane_mask all_not_equal() noexcept;
// bitwise assignment operators
__device__ constexpr lane_mask& operator&=(lane_mask mask) noexcept;
__device__ constexpr lane_mask& operator|=(lane_mask mask) noexcept;
__device__ constexpr lane_mask& operator^=(lane_mask mask) noexcept;
__device__ constexpr lane_mask& operator<<=(int shift) noexcept;
__device__ constexpr lane_mask& operator>>=(int shift) noexcept;
// bitwise operators
[[nodiscard]] __device__ friend constexpr lane_mask operator~(lane_mask mask) noexcept;
[[nodiscard]] __device__ friend constexpr lane_mask operator&(lane_mask lhs, lane_mask rhs) noexcept;
[[nodiscard]] __device__ friend constexpr lane_mask operator|(lane_mask lhs, lane_mask rhs) noexcept;
[[nodiscard]] __device__ friend constexpr lane_mask operator^(lane_mask lhs, lane_mask rhs) noexcept;
[[nodiscard]] __device__ friend constexpr lane_mask operator<<(lane_mask mask, int shift) noexcept;
[[nodiscard]] __device__ friend constexpr lane_mask operator>>(lane_mask mask, int shift) noexcept;
// comparison operators
[[nodiscard]] __device__ friend constexpr bool operator==(lane_mask lhs, lane_mask rhs) noexcept;
[[nodiscard]] __device__ friend constexpr bool operator!=(lane_mask lhs, lane_mask rhs) noexcept;
};
} // namespace cuda::device
cuda::device::lane_mask
is a class that represents a mask of lanes in a warp. It is a fancy wrapper around a single 32-bit unsigned integer value that allows for bitwise operations and comparisons, making it easier and safer to work with lane masks in CUDA device code.
The class provides several static
member functions to create common lane masks:
none()
andall()
are equivalent tolane_mask{0x0}
andlane_mask{0xFFFFFFFF}
, respectivelyall_active()
returns a mask with all active lanes in the warp, equivalent to the result__activemask()
, and finallythis_lane()
and other functions likeall_greater()
orall_less_equal()
return masks depending on the current lane index. They are implemented using the PTX special registers.
Preconditions
shift
is in the range[0, 32)
.
Example
#include <cuda/std/cassert>
#include <cuda/std/type_traits>
#include <cuda/warp>
__global__ void lane_mask_kernel() {
// import lane_mask symbol to current scope
using cuda::device::lane_mask;
// all 32 lanes are active in the beginning
assert(lane_mask::all_active() == lane_mask::all());
// this_lane() is equivalent to ~(all_less() | all_greater())
assert(lane_mask::this_lane() == ~(lane_mask::all_less() | lane_mask::all_greater()));
constexpr auto active_lanes = 20u;
constexpr auto active_lanes_mask = lane_mask{(1u << active_lanes) - 1};
// early exit lanes [20, 31]
if (threadIdx.x >= active_lanes)
{
return;
}
// not all lanes are active anymore
assert(lane_mask::all_active() != lane_mask::all());
// only lanes [0, 19] should be active now
assert(lane_mask::all_active() == lane_mask{active_lanes_mask});
}
int main() {
lane_mask_kernel<<<1, 32>>>();
cudaDeviceSynchronize();
return 0;
}