cub::DeviceFor

Defined in cub/device/device_for.cuh

struct DeviceFor

Public Static Functions

template<class ShapeT, class OpT>
static inline cudaError_t Bulk(void *d_temp_storage, size_t &temp_storage_bytes, ShapeT shape, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each index in the provided shape The algorithm is similar to bulk from P2300.

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use Bulk to square each element in a device vector.

struct square_t
{
  int* d_ptr;

  __device__ void operator()(int i)
  {
    d_ptr[i] *= d_ptr[i];
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_t op{thrust::raw_pointer_cast(vec.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform bulk operation
cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • shape[in] Shape of the index space to iterate over

  • op[in] Function object to apply to each index in the index space

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items)

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEachN to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each n operation
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last)

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEach to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each operation
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items). Unlike the ForEachN algorithm, ForEachCopyN is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopyN to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each n operation
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last). Unlike the ForEach algorithm, ForEachCopy is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopy to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

  • When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.

A Simple Example

The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each n operation
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • d_temp_storage[in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.

  • temp_storage_bytes[inout] Reference to size in bytes of d_temp_storage allocation

  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class ShapeT, class OpT>
static inline cudaError_t Bulk(ShapeT shape, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each index in the provided shape The algorithm is similar to bulk from P2300.

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use Bulk to square each element in a device vector.

struct square_t
{
  int* d_ptr;

  __device__ void operator()(int i)
  {
    d_ptr[i] *= d_ptr[i];
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_t op{thrust::raw_pointer_cast(vec.data())};

cub::DeviceFor::Bulk(vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
Parameters
  • shape[in] Shape of the index space to iterate over

  • op[in] Function object to apply to each index in the index space

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items)

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEachN to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

cub::DeviceFor::ForEachN(vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last)

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEach to square each element in a device vector.

struct square_ref_t
{
  __device__ void operator()(int& i)
  {
    i *= i;
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

cub::DeviceFor::ForEach(vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items). Unlike the ForEachN algorithm, ForEachCopyN is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopyN to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • NumItemsT – is an integral type representing the number of elements to iterate over

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • num_items[in] Number of elements to iterate over

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.

template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last). Unlike the ForEach algorithm, ForEachCopy is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopy to vectorize loads.

  • Allowed to invoke op on copies of the elements

  • The return value of op, if any, is ignored.

A Simple Example

The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.

struct odd_count_t
{
  int* d_count;

  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {2};

Template Parameters
  • RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type.

  • OpT – is a model of Unary Function

Parameters
  • first[in] The beginning of the sequence

  • last[in] The end of the sequence

  • op[in] Function object to apply to a copy of each element in the range

  • stream[in] CUDA stream to launch kernels within. Default stream is 0.