cub::DeviceFor

Defined in cub/device/device_for.cuh

struct DeviceFor

Public Static Functions

template<class ShapeT, class OpT>
static inline cudaError_t Bulk(void *d_temp_storage, size_t &temp_storage_bytes, ShapeT shape, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each index in the provided shape. The algorithm is similar to bulk from P2300.

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use Bulk to square each element in a device vector.

struct square_t
{
  int* d_ptr;

  __device__ void operator()(int i)
  {
    d_ptr[i] *= d_ptr[i];
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_t op{thrust::raw_pointer_cast(vec.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform bulk operation
cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • ShapeT – is an integral type representing the extent of the index space

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • shape[in] Shape of the index space to iterate over

  • op[in] Function object to apply to each index in the index space

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items)

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEachN to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each n operation
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last)

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEach to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each operation
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items). Unlike the ForEachN algorithm, ForEachCopyN is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopyN to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each copy n operation
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last). Unlike the ForEach algorithm, ForEachCopy is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopy to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each copy operation
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class ShapeT, class OpT>
static inline cudaError_t Bulk(ShapeT shape, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each index in the provided shape. The algorithm is similar to bulk from P2300.

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use Bulk to square each element in a device vector.

struct square_t
{
  int* d_ptr;

  __device__ void operator()(int i)
  {
    d_ptr[i] *= d_ptr[i];
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_t op{thrust::raw_pointer_cast(vec.data())};

cub::DeviceFor::Bulk(vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • ShapeT – is an integral type representing the extent of the index space

  • OpT – is a model of Unary Function

Parameters
  • shape[in] Shape of the index space to iterate over

  • op[in] Function object to apply to each index in the index space

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items)

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEachN to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

cub::DeviceFor::ForEachN(vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last)

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEach to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

cub::DeviceFor::ForEach(vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items). Unlike the ForEachN algorithm, ForEachCopyN is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopyN to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last). Unlike the ForEach algorithm, ForEachCopy is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopy to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<typename IndexType, ::cuda::std::size_t... Extents, typename OpType>
static inline cudaError_t ForEachInExtents(void *d_temp_storage, size_t &temp_storage_bytes, const ::cuda::std::extents<IndexType, Extents...> &extents, OpType op, cudaStream_t stream = {})

Overview

Iterate through a multi-dimensional extents producing

  • a single linear index that represents the current iteration

  • indices of each extent dimension

Then apply a function object to the results.

  • The return value of op, if any, is ignored.

Note: DeviceFor::ForEachInExtents supports integral index types up to 64 bits.

A Simple Example

The following code snippet demonstrates how to use ForEachInExtents to tabulate a 3D array with its coordinates.

struct linear_store_3D
{
  using data_t = cuda::std::array<int, 3>;

  cuda::std::span<data_t> d_output_raw;

  __device__ void operator()(int idx, int x, int y, int z)
  {
    d_output_raw[idx] = {x, y, z};
  }
};
using                            data_t = cuda::std::array<int, 3>;
cuda::std::extents<int, 3, 2, 2> extents{};
thrust::device_vector<data_t>    d_output(cub::detail::size(extents));
thrust::host_vector<data_t>      h_output(cub::detail::size(extents));
auto                             d_output_raw = cuda::std::span<data_t>{thrust::raw_pointer_cast(d_output.data()),
                                                                        3 * 2 * 2};
thrust::host_vector<data_t> expected = {{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
                                        {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1},
                                        {2, 0, 0}, {2, 0, 1}, {2, 1, 0}, {2, 1, 1}};

cub::DeviceFor::ForEachInExtents(extents, [=] __device__ (int idx, int x, int y, int z) {
  d_output_raw[idx] = {x, y, z};
});
h_output = d_output;
REQUIRE(h_output == expected);

thrust::fill(d_output.begin(), d_output.end(), data_t{});
cub::DeviceFor::ForEachInExtents(extents, linear_store_3D{d_output_raw});

h_output = d_output;
REQUIRE(h_output == expected);

Template Parameters
  • IndexType – is an integral type that represents the extent index space (automatically deduced)

  • Extents – are the extent sizes for each rank index (automatically deduced)

  • OpType – is a function object with arity equal to the number of extents + 1 for the linear index (iteration)

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • extents[in] Extents object that represents a multi-dimensional index space

  • op[in] Function object to apply to each linear index (iteration) and multi-dimensional coordinates

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

Returns

cudaError_t error status

template<typename IndexType, ::cuda::std::size_t... Extents, typename OpType>
static inline cudaError_t ForEachInExtents(const ::cuda::std::extents<IndexType, Extents...> &extents, OpType op, cudaStream_t stream = {})

Overview

Iterate through a multi-dimensional extents producing

  • a single linear index that represents the current iteration

  • list of indices containing the coordinates for each extent dimension

Then apply a function object to each tuple of linear index and multidimensional coordinate list.

  • The return value of op, if any, is ignored.

Note: DeviceFor::ForEachInExtents supports integral index types up to 64 bits.

A Simple Example

The following code snippet demonstrates how to use ForEachInExtents to tabulate a 3D array with its coordinates.

struct linear_store_3D
{
  using data_t = cuda::std::array<int, 3>;

  cuda::std::span<data_t> d_output_raw;

  __device__ void operator()(int idx, int x, int y, int z)
  {
    d_output_raw[idx] = {x, y, z};
  }
};
using                            data_t = cuda::std::array<int, 3>;
cuda::std::extents<int, 3, 2, 2> extents{};
thrust::device_vector<data_t>    d_output(cub::detail::size(extents));
thrust::host_vector<data_t>      h_output(cub::detail::size(extents));
auto                             d_output_raw = cuda::std::span<data_t>{thrust::raw_pointer_cast(d_output.data()),
                                                                        3 * 2 * 2};
thrust::host_vector<data_t> expected = {{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
                                        {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1},
                                        {2, 0, 0}, {2, 0, 1}, {2, 1, 0}, {2, 1, 1}};

cub::DeviceFor::ForEachInExtents(extents, [=] __device__ (int idx, int x, int y, int z) {
  d_output_raw[idx] = {x, y, z};
});
h_output = d_output;
REQUIRE(h_output == expected);

thrust::fill(d_output.begin(), d_output.end(), data_t{});
cub::DeviceFor::ForEachInExtents(extents, linear_store_3D{d_output_raw});

h_output = d_output;
REQUIRE(h_output == expected);

Template Parameters
  • IndexType – is an integral type that represents the extent index space (automatically deduced)

  • Extents – are the extent sizes for each rank index (automatically deduced)

  • OpType – is a function object with arity equal to the number of extents + 1 for the linear index (iteration)

Parameters
  • extents[in] Extents object that represents a multi-dimensional index space

  • op[in] Function object to apply to each linear index (iteration) and multi-dimensional coordinates

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

Returns

cudaError_t error status