cub::DeviceFor
Defined in cub/device/device_for.cuh
-
struct DeviceFor
Public Static Functions
-
template<class ShapeT, class OpT>
static inline cudaError_t Bulk(void *d_temp_storage, size_t &temp_storage_bytes, ShapeT shape, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each index in the provided shape The algorithm is similar to bulk from P2300.The return value of
op
, if any, is ignored.When
d_temp_storage
isnullptr
, no work is done and the required allocation size is returned intemp_storage_bytes
.
A Simple Example
The following code snippet demonstrates how to use Bulk to square each element in a device vector.
struct square_t { int* d_ptr; __device__ void operator()(int i) { d_ptr[i] *= d_ptr[i]; } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; square_t op{thrust::raw_pointer_cast(vec.data())}; // 1) Get temp storage size std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op); // 2) Allocate temp storage thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); // 3) Perform bulk operation cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op); thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
ShapeT – is an integral type
OpT – is a model of Unary Function
- Parameters
d_temp_storage – [in] Device-accessible allocation of temporary storage. When
nullptr
, the required allocation size is written totemp_storage_bytes
and no work is done.temp_storage_bytes – [inout] Reference to size in bytes of
d_temp_storage
allocationshape – [in] Shape of the index space to iterate over
op – [in] Function object to apply to each index in the index space
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each element in the range[first, first + num_items)
The return value of
op
, if any, is ignored.When
d_temp_storage
isnullptr
, no work is done and the required allocation size is returned intemp_storage_bytes
.
A Simple Example
The following code snippet demonstrates how to use ForEachN to square each element in a device vector.
struct square_ref_t { __device__ void operator()(int& i) { i *= i; } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; square_ref_t op{}; // 1) Get temp storage size std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op); // 2) Allocate temp storage thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); // 3) Perform for each n operation cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op); thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to
op
’s argument type.NumItemsT – is an integral type representing the number of elements to iterate over
OpT – is a model of Unary Function
- Parameters
d_temp_storage – [in] Device-accessible allocation of temporary storage. When
nullptr
, the required allocation size is written totemp_storage_bytes
and no work is done.temp_storage_bytes – [inout] Reference to size in bytes of
d_temp_storage
allocationfirst – [in] The beginning of the sequence
num_items – [in] Number of elements to iterate over
op – [in] Function object to apply to each element in the range
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each element in the range[first, last)
The return value of
op
, if any, is ignored.When
d_temp_storage
isnullptr
, no work is done and the required allocation size is returned intemp_storage_bytes
.
A Simple Example
The following code snippet demonstrates how to use ForEach to square each element in a device vector.
struct square_ref_t { __device__ void operator()(int& i) { i *= i; } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; square_ref_t op{}; // 1) Get temp storage size std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op); // 2) Allocate temp storage thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); // 3) Perform for each operation cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op); thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to
op
’s argument type.OpT – is a model of Unary Function
- Parameters
d_temp_storage – [in] Device-accessible allocation of temporary storage. When
nullptr
, the required allocation size is written totemp_storage_bytes
and no work is done.temp_storage_bytes – [inout] Reference to size in bytes of
d_temp_storage
allocationfirst – [in] The beginning of the sequence
last – [in] The end of the sequence
op – [in] Function object to apply to each element in the range
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each element in the range[first, first + num_items)
. Unlike theForEachN
algorithm,ForEachCopyN
is allowed to invokeop
on copies of the elements. This relaxation allowsForEachCopyN
to vectorize loads.Allowed to invoke
op
on copies of the elementsThe return value of
op
, if any, is ignored.When
d_temp_storage
isnullptr
, no work is done and the required allocation size is returned intemp_storage_bytes
.
A Simple Example
The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.
struct odd_count_t { int* d_count; __device__ void operator()(int i) { if (i % 2 == 1) { atomicAdd(d_count, 1); } } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; thrust::device_vector<int> count(1); odd_count_t op{thrust::raw_pointer_cast(count.data())}; // 1) Get temp storage size std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op); // 2) Allocate temp storage thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); // 3) Perform for each n operation cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op); thrust::device_vector<int> expected = {2};
- Template Parameters
RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to
op
’s argument type.NumItemsT – is an integral type representing the number of elements to iterate over
OpT – is a model of Unary Function
- Parameters
d_temp_storage – [in] Device-accessible allocation of temporary storage. When
nullptr
, the required allocation size is written totemp_storage_bytes
and no work is done.temp_storage_bytes – [inout] Reference to size in bytes of
d_temp_storage
allocationfirst – [in] The beginning of the sequence
num_items – [in] Number of elements to iterate over
op – [in] Function object to apply to a copy of each element in the range
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each element in the range[first, last)
. Unlike theForEach
algorithm,ForEachCopy
is allowed to invokeop
on copies of the elements. This relaxation allowsForEachCopy
to vectorize loads.Allowed to invoke
op
on copies of the elementsThe return value of
op
, if any, is ignored.When
d_temp_storage
isnullptr
, no work is done and the required allocation size is returned intemp_storage_bytes
.
A Simple Example
The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.
struct odd_count_t { int* d_count; __device__ void operator()(int i) { if (i % 2 == 1) { atomicAdd(d_count, 1); } } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; thrust::device_vector<int> count(1); odd_count_t op{thrust::raw_pointer_cast(count.data())}; // 1) Get temp storage size std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op); // 2) Allocate temp storage thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); // 3) Perform for each n operation cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op); thrust::device_vector<int> expected = {2};
- Template Parameters
RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to
op
’s argument type.OpT – is a model of Unary Function
- Parameters
d_temp_storage – [in] Device-accessible allocation of temporary storage. When
nullptr
, the required allocation size is written totemp_storage_bytes
and no work is done.temp_storage_bytes – [inout] Reference to size in bytes of
d_temp_storage
allocationfirst – [in] The beginning of the sequence
last – [in] The end of the sequence
op – [in] Function object to apply to a copy of each element in the range
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class ShapeT, class OpT>
static inline cudaError_t Bulk(ShapeT shape, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each index in the provided shape The algorithm is similar to bulk from P2300.The return value of
op
, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use Bulk to square each element in a device vector.
struct square_t { int* d_ptr; __device__ void operator()(int i) { d_ptr[i] *= d_ptr[i]; } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; square_t op{thrust::raw_pointer_cast(vec.data())}; cub::DeviceFor::Bulk(vec.size(), op); thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
ShapeT – is an integral type
OpT – is a model of Unary Function
- Parameters
shape – [in] Shape of the index space to iterate over
op – [in] Function object to apply to each index in the index space
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each element in the range[first, first + num_items)
The return value of
op
, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use ForEachN to square each element in a device vector.
struct square_ref_t { __device__ void operator()(int& i) { i *= i; } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; square_ref_t op{}; cub::DeviceFor::ForEachN(vec.begin(), vec.size(), op); thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to
op
’s argument type.NumItemsT – is an integral type representing the number of elements to iterate over
OpT – is a model of Unary Function
- Parameters
first – [in] The beginning of the sequence
num_items – [in] Number of elements to iterate over
op – [in] Function object to apply to each element in the range
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each element in the range[first, last)
The return value of
op
, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use ForEach to square each element in a device vector.
struct square_ref_t { __device__ void operator()(int& i) { i *= i; } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; square_ref_t op{}; cub::DeviceFor::ForEach(vec.begin(), vec.end(), op); thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to
op
’s argument type.OpT – is a model of Unary Function
- Parameters
first – [in] The beginning of the sequence
last – [in] The end of the sequence
op – [in] Function object to apply to each element in the range
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each element in the range[first, first + num_items)
. Unlike theForEachN
algorithm,ForEachCopyN
is allowed to invokeop
on copies of the elements. This relaxation allowsForEachCopyN
to vectorize loads.Allowed to invoke
op
on copies of the elementsThe return value of
op
, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.
struct odd_count_t { int* d_count; __device__ void operator()(int i) { if (i % 2 == 1) { atomicAdd(d_count, 1); } } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; thrust::device_vector<int> count(1); odd_count_t op{thrust::raw_pointer_cast(count.data())}; cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op); thrust::device_vector<int> expected = {2};
- Template Parameters
RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to
op
’s argument type.NumItemsT – is an integral type representing the number of elements to iterate over
OpT – is a model of Unary Function
- Parameters
first – [in] The beginning of the sequence
num_items – [in] Number of elements to iterate over
op – [in] Function object to apply to a copy of each element in the range
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) Overview
Applies the function object
op
to each element in the range[first, last)
. Unlike theForEach
algorithm,ForEachCopy
is allowed to invokeop
on copies of the elements. This relaxation allowsForEachCopy
to vectorize loads.Allowed to invoke
op
on copies of the elementsThe return value of
op
, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.
struct odd_count_t { int* d_count; __device__ void operator()(int i) { if (i % 2 == 1) { atomicAdd(d_count, 1); } } };
thrust::device_vector<int> vec = {1, 2, 3, 4}; thrust::device_vector<int> count(1); odd_count_t op{thrust::raw_pointer_cast(count.data())}; cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op); thrust::device_vector<int> expected = {2};
- Template Parameters
RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to
op
’s argument type.OpT – is a model of Unary Function
- Parameters
first – [in] The beginning of the sequence
last – [in] The end of the sequence
op – [in] Function object to apply to a copy of each element in the range
stream – [in] CUDA stream to launch kernels within. Default stream is
0
.
-
template<class ShapeT, class OpT>