cub::DeviceFor

Defined in cub/device/device_for.cuh

struct DeviceFor

Public Static Functions

template<class ShapeT, class OpT>
static inline cudaError_t Bulk(void *d_temp_storage, size_t &temp_storage_bytes, ShapeT shape, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each index in the provided shape. The algorithm is similar to bulk from P2300.

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use Bulk to square each element in a device vector.

struct square_t
{
  int* d_ptr;

  __device__ void operator()(int i)
  {
    d_ptr[i] *= d_ptr[i];
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_t op{thrust::raw_pointer_cast(vec.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform bulk operation
cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • ShapeT – is an integral type representing the extent of the index space

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • shape[in] Shape of the index space to iterate over

  • op[in] Function object to apply to each index in the index space

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items)

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEachN to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each n operation
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last)

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEach to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each operation
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items). Unlike the ForEachN algorithm, ForEachCopyN is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopyN to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each copy n operation
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last). Unlike the ForEach algorithm, ForEachCopy is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopy to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each copy operation
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class ShapeT, class OpT>
static inline cudaError_t Bulk(ShapeT shape, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each index in the provided shape. The algorithm is similar to bulk from P2300.

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use Bulk to square each element in a device vector.

struct square_t
{
  int* d_ptr;

  __device__ void operator()(int i)
  {
    d_ptr[i] *= d_ptr[i];
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_t op{thrust::raw_pointer_cast(vec.data())};

cub::DeviceFor::Bulk(vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • ShapeT – is an integral type representing the extent of the index space

  • OpT – is a model of Unary Function

Parameters
  • shape[in] Shape of the index space to iterate over

  • op[in] Function object to apply to each index in the index space

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items)

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEachN to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

cub::DeviceFor::ForEachN(vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last)

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEach to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

cub::DeviceFor::ForEach(vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items). Unlike the ForEachN algorithm, ForEachCopyN is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopyN to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last). Unlike the ForEach algorithm, ForEachCopy is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopy to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<typename IndexType, ::cuda::std::size_t... Extents, typename OpType>
static inline cudaError_t ForEachInExtents(void *d_temp_storage, size_t &temp_storage_bytes, const ::cuda::std::extents<IndexType, Extents...> &extents, OpType op, cudaStream_t stream = {})

Overview

Iterate through a multi-dimensional extents producing

  • a single linear index that represents the current iteration

  • indices of each extent dimension

Then apply a function object to the results.

  • The return value of op, if any, is ignored.

Note: DeviceFor::ForEachInExtents supports integral index types up to 64 bits.

A Simple Example

The following code snippet demonstrates how to use ForEachInExtents to tabulate a 3D array with its coordinates.

struct linear_store_3D
{
  using data_t = cuda::std::array<int, 3>;

  cuda::std::span<data_t> d_output_raw;

  __device__ void operator()(int idx, int x, int y, int z)
  {
    d_output_raw[idx] = {x, y, z};
  }
};
using                            data_t = cuda::std::array<int, 3>;
cuda::std::extents<int, 3, 2, 2> extents{};
thrust::device_vector<data_t>    d_output(cub::detail::size(extents));
thrust::host_vector<data_t>      h_output(cub::detail::size(extents));
auto                             d_output_raw = cuda::std::span<data_t>{thrust::raw_pointer_cast(d_output.data()),
                                                                        3 * 2 * 2};
thrust::host_vector<data_t> expected = {{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
                                        {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1},
                                        {2, 0, 0}, {2, 0, 1}, {2, 1, 0}, {2, 1, 1}};

cub::DeviceFor::ForEachInExtents(extents, [=] __device__ (int idx, int x, int y, int z) {
  d_output_raw[idx] = {x, y, z};
});
h_output = d_output;
REQUIRE(h_output == expected);

thrust::fill(d_output.begin(), d_output.end(), data_t{});
cub::DeviceFor::ForEachInExtents(extents, linear_store_3D{d_output_raw});

h_output = d_output;
REQUIRE(h_output == expected);

Template Parameters
  • IndexType – is an integral type that represents the extent index space (automatically deduced)

  • Extents – are the extent sizes for each rank index (automatically deduced)

  • OpType – is a function object with arity equal to the number of extents + 1 for the linear index (iteration)

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • extents[in] Extents object that represents a multi-dimensional index space

  • op[in] Function object to apply to each linear index (iteration) and multi-dimensional coordinates

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

Returns

cudaError_t error status

template<typename IndexType, ::cuda::std::size_t... Extents, typename OpType>
static inline cudaError_t ForEachInExtents(const ::cuda::std::extents<IndexType, Extents...> &extents, OpType op, cudaStream_t stream = {})

Overview

Iterate through a multi-dimensional extents producing

  • a single linear index that represents the current iteration

  • list of indices containing the coordinates for each extent dimension

Then apply a function object to each tuple of linear index and multidimensional coordinate list.

  • The return value of op, if any, is ignored.

Note: DeviceFor::ForEachInExtents supports integral index types up to 64 bits.

A Simple Example

The following code snippet demonstrates how to use ForEachInExtents to tabulate a 3D array with its coordinates.

struct linear_store_3D
{
  using data_t = cuda::std::array<int, 3>;

  cuda::std::span<data_t> d_output_raw;

  __device__ void operator()(int idx, int x, int y, int z)
  {
    d_output_raw[idx] = {x, y, z};
  }
};
using                            data_t = cuda::std::array<int, 3>;
cuda::std::extents<int, 3, 2, 2> extents{};
thrust::device_vector<data_t>    d_output(cub::detail::size(extents));
thrust::host_vector<data_t>      h_output(cub::detail::size(extents));
auto                             d_output_raw = cuda::std::span<data_t>{thrust::raw_pointer_cast(d_output.data()),
                                                                        3 * 2 * 2};
thrust::host_vector<data_t> expected = {{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
                                        {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1},
                                        {2, 0, 0}, {2, 0, 1}, {2, 1, 0}, {2, 1, 1}};

cub::DeviceFor::ForEachInExtents(extents, [=] __device__ (int idx, int x, int y, int z) {
  d_output_raw[idx] = {x, y, z};
});
h_output = d_output;
REQUIRE(h_output == expected);

thrust::fill(d_output.begin(), d_output.end(), data_t{});
cub::DeviceFor::ForEachInExtents(extents, linear_store_3D{d_output_raw});

h_output = d_output;
REQUIRE(h_output == expected);

Template Parameters
  • IndexType – is an integral type that represents the extent index space (automatically deduced)

  • Extents – are the extent sizes for each rank index (automatically deduced)

  • OpType – is a function object with arity equal to the number of extents + 1 for the linear index (iteration)

Parameters
  • extents[in] Extents object that represents a multi-dimensional index space

  • op[in] Function object to apply to each linear index (iteration) and multi-dimensional coordinates

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

Returns

cudaError_t error status