Warp Shuffle

warp_shuffle_idx:

template <int Width = 32, typename T>
[[nodiscard]] __device__ WarpShuffleResult<T>
warp_shuffle_idx(const T& data,
                 int      src_lane,
                 uint32_t lane_mask = 0xFFFFFFFF,
                 cuda::std::integral_constant<int, Width> = {})

template <int Width = 32, typename T>
[[nodiscard]] __device__ WarpShuffleResult<T>
warp_shuffle_idx(const T& data,
                 int      src_lane,
                 cuda::std::integral_constant<int, Width>) // lane_mask is 0xFFFFFFFF

warp_shuffle_up:

template <int Width = 32, typename T>
[[nodiscard]] __device__ WarpShuffleResult<T>
warp_shuffle_up(const T& data,
                int      delta,
                uint32_t lane_mask = 0xFFFFFFFF,
                cuda::std::integral_constant<int, Width> = {})

template <int Width = 32, typename T>
[[nodiscard]] __device__ WarpShuffleResult<T>
warp_shuffle_up(const T& data,
                int      delta,
                cuda::std::integral_constant<int, Width>) // lane_mask is 0xFFFFFFFF

warp_shuffle_down:

template <int Width = 32, typename T>
[[nodiscard]] __device__ WarpShuffleResult<T>
warp_shuffle_down(const T& data,
                  int      delta,
                  uint32_t lane_mask = 0xFFFFFFFF,
                  cuda::std::integral_constant<int, Width> = {})

template <int Width = 32, typename T>
[[nodiscard]] __device__ WarpShuffleResult<T>
warp_shuffle_down(const T& data,
                  int      delta,
                  cuda::std::integral_constant<int, Width>) // lane_mask is 0xFFFFFFFF

warp_shuffle_xor:

template <int Width = 32, typename T>
[[nodiscard]] __device__ WarpShuffleResult<T>
warp_shuffle_xor(const T& data,
                 int      xor_mask,
                 uint32_t lane_mask = 0xFFFFFFFF,
                 cuda::std::integral_constant<int, Width> = {})

template <int Width = 32, typename T>
[[nodiscard]] __device__ WarpShuffleResult<T>
warp_shuffle_xor(const T& data,
                 int      xor_mask,
                 cuda::std::integral_constant<int, Width>) // lane_mask is 0xFFFFFFFF

Result type:

template <typename T>
struct WarpShuffleResult {
    T    data;
    bool pred;

    __device__ operator T() const { return data; }
};

The functionality provides a generalized and safe alternative to CUDA warp shuffle intrinsics. The functions allow to exchange data of any data size, including raw arrays, pointers, and structs.

Parameters

data: data to exchange.
src_lane: source lane.
delta: offset from the source lane.
xor_mask: XOR mask to apply to the source lane.

Return value

WarpShuffleResult:

data: data of the destination lane.
pred: true if the destination lane is within the source lane window. false otherwise.

Constrains

Width must be a power of two in the range [1, 32]
T: only void pointers are allowed to avoid bug-prone code

Preconditions

lane_mask must be a subset of the active mask
The destination lane must be a member of the lane_mask
delta and xor_mask must be less than Width. Modulo behavior is allowed for src_lane
All lanes must have the same value for lane_mask, delta and xor_mask

Performance considerations

The function calls the PTX instruction shfl.sync \(ceil\left(\frac{sizeof(data)}{4}\right)\) times.

References

Example

#include <cuda/std/array>
#include <cuda/std/type_traits>
#include <cuda/warp>
#include <cstdio>

struct MyStruct {
    double x;
    int    y;
};

__global__ void warp_shuffle_kernel() {
    cuda::std::integral_constant<int, 16> half_warp;
    auto                     laneid      = cuda::ptx::get_sreg_laneid();
    int                      raw_array[] = {threadIdx.x, threadIdx.x + 1, threadIdx.x + 2};
    cuda::std::array<int, 3> array       = {threadIdx.x, threadIdx.x + 1, threadIdx.x + 2};
    MyStruct                 my_structs{static_cast<double>(threadIdx.x), threadIdx.x + 1};
    if (laneid < 16) {
        // lanes [0, 15] get an array with values {5, 6, 7}
        auto ret = cuda::device::warp_shuffle_idx(raw_array, 5, 0xFFFF, half_warp);
        printf("lane %2d: [%d, %d, %d]\n", laneid, ret.data[0], ret.data[1], ret.data[2]);

        // lanes [1, 15] get an array with values {threadIdx.x - 1, threadIdx.x, threadIdx.x + 1}
        // lane 0 keeps the original values
        auto array_ret = cuda::device::warp_shuffle_up(array, 1, half_warp).data;
        printf("lane %2d: [%d, %d, %d]\n", laneid, array[0], array[1], array_ret[2]);
    }
    // lanes [0, 13] get my_structs with values {threadIdx.x + 2, threadIdx.x + 3} and pred=true
    auto ret = cuda::device::warp_shuffle_down<16>(my_structs, 2);
    printf("lane %2d: {%f, %d}, pred %d\n", laneid, ret.data.x, ret.data.y, ret.pred);
}

int main() {
    warp_shuffle_kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}

See it on Godbolt 🔗