cub::DeviceFor
Defined in cub/device/device_for.cuh
-
struct DeviceFor
Public Static Functions
-
template<class ShapeT, class OpT>
static inline cudaError_t Bulk(void *d_temp_storage, size_t &temp_storage_bytes, ShapeT shape, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each index in the provided shape. The algorithm is similar to bulk from P2300.

The return value of op, if any, is ignored.

When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.
A Simple Example
The following code snippet demonstrates how to use Bulk to square each element in a device vector.
struct square_t
{
  int* d_ptr;
  __device__ void operator()(int i) { d_ptr[i] *= d_ptr[i]; }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_t op{thrust::raw_pointer_cast(vec.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform bulk operation
cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
  ShapeT – is an integral type
  OpT – is a model of Unary Function
- Parameters
  d_temp_storage – [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
  temp_storage_bytes – [in,out] Reference to size in bytes of d_temp_storage allocation
  shape – [in] Shape of the index space to iterate over
  op – [in] Function object to apply to each index in the index space
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
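The snippet below is an additional minimal sketch, not part of the documented example: it assumes a user-created stream and shows one way to check the cudaError_t returned by both phases of the call.

#include <cub/device/device_for.cuh>
#include <thrust/device_vector.h>
#include <cstdint>
#include <cstdio>

struct square_t
{
  int* d_ptr;
  __device__ void operator()(int i) { d_ptr[i] *= d_ptr[i]; }
};

int main()
{
  thrust::device_vector<int> vec = {1, 2, 3, 4};
  square_t op{thrust::raw_pointer_cast(vec.data())};

  cudaStream_t stream{};  // hypothetical user-created stream
  cudaStreamCreate(&stream);

  // Query temporary storage requirements (d_temp_storage == nullptr).
  std::uint8_t* d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cudaError_t status =
    cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op, stream);
  if (status != cudaSuccess) { std::printf("%s\n", cudaGetErrorString(status)); return 1; }

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  status = cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op, stream);
  if (status != cudaSuccess) { std::printf("%s\n", cudaGetErrorString(status)); return 1; }

  cudaStreamSynchronize(stream);  // wait for the bulk launch before using the results
  cudaStreamDestroy(stream);
}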
-
template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items).

The return value of op, if any, is ignored.

When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.
A Simple Example
The following code snippet demonstrates how to use ForEachN to square each element in a device vector.
struct square_ref_t
{
  __device__ void operator()(int& i) { i *= i; }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each n operation
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
  RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type
  NumItemsT – is an integral type representing the number of elements to iterate over
  OpT – is a model of Unary Function
- Parameters
  d_temp_storage – [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
  temp_storage_bytes – [in,out] Reference to size in bytes of d_temp_storage allocation
  first – [in] The beginning of the sequence
  num_items – [in] Number of elements to iterate over
  op – [in] Function object to apply to each element in the range
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
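As a further illustration (an assumption layered on the documented example, with hypothetical names such as scale_in_place): a raw device pointer is itself a random access iterator, so it can be passed to ForEachN directly; cudaMallocAsync is just one possible way to obtain the temporary storage.

#include <cub/device/device_for.cuh>
#include <cuda_runtime.h>
#include <cstddef>

struct scale_t
{
  __device__ void operator()(float& x) { x *= 2.0f; }
};

// d_data is assumed to point to n floats in device memory.
void scale_in_place(float* d_data, int n, cudaStream_t stream)
{
  // 1) Get temp storage size
  void* d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, d_data, n, scale_t{}, stream);

  // 2) Allocate temp storage (stream-ordered allocation; any device allocation works)
  cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream);

  // 3) Perform for each n operation, then release the temporary storage
  cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, d_data, n, scale_t{}, stream);
  cudaFreeAsync(d_temp_storage, stream);
}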
-
template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last).

The return value of op, if any, is ignored.

When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.
A Simple Example
The following code snippet demonstrates how to use ForEach to square each element in a device vector.
struct square_ref_t
{
  __device__ void operator()(int& i) { i *= i; }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each operation
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
  RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type
  OpT – is a model of Unary Function
- Parameters
  d_temp_storage – [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
  temp_storage_bytes – [in,out] Reference to size in bytes of d_temp_storage allocation
  first – [in] The beginning of the sequence
  last – [in] The end of the sequence
  op – [in] Function object to apply to each element in the range
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
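A small extra sketch (illustrative only, not part of the documented example): because ForEach takes an iterator pair, it can be restricted to a subrange, here the interior elements of the vector.

#include <cub/device/device_for.cuh>
#include <thrust/device_vector.h>
#include <cstdint>

struct negate_t
{
  __device__ void operator()(int& i) { i = -i; }
};

int main()
{
  thrust::device_vector<int> vec = {1, 2, 3, 4, 5};

  // Operate only on the interior elements [vec.begin() + 1, vec.end() - 1).
  void* d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin() + 1, vec.end() - 1, negate_t{});

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin() + 1, vec.end() - 1, negate_t{});
  // vec is now expected to hold {1, -2, -3, -4, 5}.
}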
-
template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items). Unlike the ForEachN algorithm, ForEachCopyN is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopyN to vectorize loads.

- Allowed to invoke op on copies of the elements
- The return value of op, if any, is ignored.

When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.
A Simple Example
The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.
struct odd_count_t
{
  int* d_count;
  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each n operation
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {2};
- Template Parameters
  RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type
  NumItemsT – is an integral type representing the number of elements to iterate over
  OpT – is a model of Unary Function
- Parameters
  d_temp_storage – [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
  temp_storage_bytes – [in,out] Reference to size in bytes of d_temp_storage allocation
  first – [in] The beginning of the sequence
  num_items – [in] Number of elements to iterate over
  op – [in] Function object to apply to a copy of each element in the range
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
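An additional sketch (illustrative only; cub::DeviceReduce is the natural tool for real reductions): since op receives copies, any result must be written through separate device memory, here a single atomically updated maximum.

#include <cub/device/device_for.cuh>
#include <thrust/device_vector.h>
#include <climits>
#include <cstdint>

struct max_op_t
{
  int* d_max;
  __device__ void operator()(int value) { atomicMax(d_max, value); }
};

int main()
{
  thrust::device_vector<int> vec = {3, 1, 4, 1, 5};
  thrust::device_vector<int> result(1, INT_MIN);
  max_op_t op{thrust::raw_pointer_cast(result.data())};

  void* d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
  // result[0] is expected to be 5.
}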
-
template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(void *d_temp_storage, size_t &temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last). Unlike the ForEach algorithm, ForEachCopy is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopy to vectorize loads.

- Allowed to invoke op on copies of the elements
- The return value of op, if any, is ignored.

When d_temp_storage is nullptr, no work is done and the required allocation size is returned in temp_storage_bytes.
A Simple Example
The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.
struct odd_count_t
{
  int* d_count;
  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

// 1) Get temp storage size
std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

// 2) Allocate temp storage
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

// 3) Perform for each operation
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {2};
- Template Parameters
  RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type
  OpT – is a model of Unary Function
- Parameters
  d_temp_storage – [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
  temp_storage_bytes – [in,out] Reference to size in bytes of d_temp_storage allocation
  first – [in] The beginning of the sequence
  last – [in] The end of the sequence
  op – [in] Function object to apply to a copy of each element in the range
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
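Another sketch (an assumption, not part of the original documentation): when the translation unit is compiled with nvcc's --extended-lambda flag, a device lambda can stand in for the named function object.

#include <cub/device/device_for.cuh>
#include <thrust/device_vector.h>
#include <cstdint>

// Assumes compilation with: nvcc --extended-lambda ...
int main()
{
  thrust::device_vector<int> vec = {1, 2, 3, 4};
  thrust::device_vector<int> count(1);
  int* d_count = thrust::raw_pointer_cast(count.data());

  // Device lambda capturing the raw counter pointer by value.
  auto op = [d_count] __device__ (int i) {
    if (i % 2 == 1) { atomicAdd(d_count, 1); }
  };

  void* d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
  // count[0] is expected to be 2.
}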
-
template<class ShapeT, class OpT>
static inline cudaError_t Bulk(ShapeT shape, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each index in the provided shape. The algorithm is similar to bulk from P2300.

The return value of op, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use Bulk to square each element in a device vector.
struct square_t
{
  int* d_ptr;
  __device__ void operator()(int i) { d_ptr[i] *= d_ptr[i]; }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_t op{thrust::raw_pointer_cast(vec.data())};

cub::DeviceFor::Bulk(vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
  ShapeT – is an integral type
  OpT – is a model of Unary Function
- Parameters
  shape – [in] Shape of the index space to iterate over
  op – [in] Function object to apply to each index in the index space
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
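The extra sketch below (illustrative, with hypothetical names such as iota_t) uses Bulk to write an index sequence; like other stream-ordered work, the launch is asynchronous with respect to the host, so the host synchronizes before reading the results.

#include <cub/device/device_for.cuh>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

struct iota_t
{
  int* d_out;
  __device__ void operator()(int i) { d_out[i] = i; }
};

int main()
{
  thrust::device_vector<int> out(8);
  cub::DeviceFor::Bulk(out.size(), iota_t{thrust::raw_pointer_cast(out.data())});

  cudaDeviceSynchronize();               // wait for the bulk launch to finish
  thrust::host_vector<int> h_out = out;  // h_out is expected to be {0, 1, ..., 7}
}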
-
template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items).

The return value of op, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use ForEachN to square each element in a device vector.
struct square_ref_t
{
  __device__ void operator()(int& i) { i *= i; }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

cub::DeviceFor::ForEachN(vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
  RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type
  NumItemsT – is an integral type representing the number of elements to iterate over
  OpT – is a model of Unary Function
- Parameters
  first – [in] The beginning of the sequence
  num_items – [in] Number of elements to iterate over
  op – [in] Function object to apply to each element in the range
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
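A further sketch (illustrative; the functor name write_double_t is hypothetical): a thrust::counting_iterator can supply the elements, so ForEachN can walk an index space without materializing it; op then receives the indices by value.

#include <cub/device/device_for.cuh>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>

struct write_double_t
{
  int* d_out;
  __device__ void operator()(int i) { d_out[i] = 2 * i; }
};

int main()
{
  thrust::device_vector<int> out(4);
  write_double_t op{thrust::raw_pointer_cast(out.data())};

  // Elements come from a counting iterator: op is called with 0, 1, 2, 3.
  cub::DeviceFor::ForEachN(thrust::counting_iterator<int>(0), out.size(), op);

  thrust::device_vector<int> expected = {0, 2, 4, 6};
}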
-
template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEach(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last).

The return value of op, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use ForEach to square each element in a device vector.
struct square_ref_t
{
  __device__ void operator()(int& i) { i *= i; }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
square_ref_t op{};

cub::DeviceFor::ForEach(vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {1, 4, 9, 16};
- Template Parameters
  RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type
  OpT – is a model of Unary Function
- Parameters
  first – [in] The beginning of the sequence
  last – [in] The end of the sequence
  op – [in] Function object to apply to each element in the range
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
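One more sketch (illustrative only): the function object can carry state, such as clamp bounds, and ForEach mutates the elements in place through the reference parameter.

#include <cub/device/device_for.cuh>
#include <thrust/device_vector.h>

struct clamp_t
{
  float lo, hi;
  __device__ void operator()(float& x) { x = x < lo ? lo : (x > hi ? hi : x); }
};

int main()
{
  thrust::device_vector<float> vec = {-2.0f, 0.5f, 3.0f};

  // Clamp every element into [0, 1] in place.
  cub::DeviceFor::ForEach(vec.begin(), vec.end(), clamp_t{0.0f, 1.0f});
  // vec is expected to be {0.0f, 0.5f, 1.0f}.
}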
-
template<class RandomAccessIteratorT, class NumItemsT, class OpT>
static inline cudaError_t ForEachCopyN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, first + num_items). Unlike the ForEachN algorithm, ForEachCopyN is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopyN to vectorize loads.

- Allowed to invoke op on copies of the elements
- The return value of op, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use ForEachCopyN to count odd elements in a device vector.
struct odd_count_t
{
  int* d_count;
  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op);

thrust::device_vector<int> expected = {2};
- Template Parameters
  RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type
  NumItemsT – is an integral type representing the number of elements to iterate over
  OpT – is a model of Unary Function
- Parameters
  first – [in] The beginning of the sequence
  num_items – [in] Number of elements to iterate over
  op – [in] Function object to apply to a copy of each element in the range
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
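An extra sketch (illustrative; cub::DeviceHistogram is the dedicated tool for histograms): ForEachCopyN can bin values by atomically incrementing per-bin counters held in separate device memory.

#include <cub/device/device_for.cuh>
#include <thrust/device_vector.h>

struct histogram_t
{
  int* d_bins;   // assumed to hold num_bins zero-initialized counters
  int num_bins;
  __device__ void operator()(int value) { atomicAdd(d_bins + (value % num_bins), 1); }
};

int main()
{
  thrust::device_vector<int> vec  = {1, 2, 3, 4, 5, 6};
  thrust::device_vector<int> bins(3);   // zero-initialized
  histogram_t op{thrust::raw_pointer_cast(bins.data()), 3};

  cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op);
  // bins is expected to be {2, 2, 2}.
}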
-
template<class RandomAccessIteratorT, class OpT>
static inline cudaError_t ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})

Overview

Applies the function object op to each element in the range [first, last). Unlike the ForEach algorithm, ForEachCopy is allowed to invoke op on copies of the elements. This relaxation allows ForEachCopy to vectorize loads.

- Allowed to invoke op on copies of the elements
- The return value of op, if any, is ignored.
A Simple Example
The following code snippet demonstrates how to use ForEachCopy to count odd elements in a device vector.
struct odd_count_t
{
  int* d_count;
  __device__ void operator()(int i)
  {
    if (i % 2 == 1)
    {
      atomicAdd(d_count, 1);
    }
  }
};
thrust::device_vector<int> vec = {1, 2, 3, 4};
thrust::device_vector<int> count(1);
odd_count_t op{thrust::raw_pointer_cast(count.data())};

cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op);

thrust::device_vector<int> expected = {2};
- Template Parameters
  RandomAccessIteratorT – is a model of Random Access Iterator whose value type is convertible to op’s argument type
  OpT – is a model of Unary Function
- Parameters
  first – [in] The beginning of the sequence
  last – [in] The end of the sequence
  op – [in] Function object to apply to a copy of each element in the range
  stream – [in] CUDA stream to launch kernels within. Default stream is 0.
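A short additional sketch (illustrative; the helper function and pointer names are hypothetical, and the data is assumed to already reside in device memory): raw device pointers also model random access iterators, so a pointer pair works directly with ForEachCopy.

#include <cub/device/device_for.cuh>

struct count_negative_t
{
  int* d_count;
  __device__ void operator()(int value) { if (value < 0) { atomicAdd(d_count, 1); } }
};

// d_data is assumed to point to n ints in device memory;
// d_count to a single zero-initialized device counter.
void count_negatives(const int* d_data, int n, int* d_count, cudaStream_t stream)
{
  cub::DeviceFor::ForEachCopy(d_data, d_data + n, count_negative_t{d_count}, stream);
}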
-
template<typename IndexType, ::cuda::std::size_t... Extents, typename OpType>
static inline cudaError_t ForEachInExtents(void *d_temp_storage, size_t &temp_storage_bytes, const ::cuda::std::extents<IndexType, Extents...> &extents, OpType op, cudaStream_t stream = {})

Overview

Iterates through a multi-dimensional extents, producing for each iteration
- a single linear index that represents the current iteration
- the indices (coordinates) of each extent dimension
and then applies the function object op to these values.

The return value of op, if any, is ignored.

Note: DeviceFor::ForEachInExtents supports integral index types up to 64 bits.

A Simple Example
The following code snippet demonstrates how to use ForEachInExtents to tabulate a 3D array with its coordinates.

struct linear_store_3D
{
  using data_t = cuda::std::array<int, 3>;

  cuda::std::span<data_t> d_output_raw;

  __device__ void operator()(int idx, int x, int y, int z)
  {
    d_output_raw[idx] = {x, y, z};
  }
};
using data_t = cuda::std::array<int, 3>;
cuda::std::extents<int, 3, 2, 2> extents{};
thrust::device_vector<data_t> d_output(cub::detail::size(extents));
thrust::host_vector<data_t> h_output(cub::detail::size(extents));
auto d_output_raw = cuda::std::span<data_t>{thrust::raw_pointer_cast(d_output.data()), 3 * 2 * 2};

thrust::host_vector<data_t> expected = {
  {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
  {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1},
  {2, 0, 0}, {2, 0, 1}, {2, 1, 0}, {2, 1, 1}};

cub::DeviceFor::ForEachInExtents(extents, [=] __device__ (int idx, int x, int y, int z) {
  d_output_raw[idx] = {x, y, z};
});
h_output = d_output;
REQUIRE(h_output == expected);

thrust::fill(d_output.begin(), d_output.end(), data_t{});

cub::DeviceFor::ForEachInExtents(extents, linear_store_3D{d_output_raw});
h_output = d_output;
REQUIRE(h_output == expected);
- Template Parameters
  IndexType – is an integral type that represents the extent index space (automatically deduced)
  Extents – are the extent sizes for each rank index (automatically deduced)
  OpType – is a function object with arity equal to the number of extents + 1 for the linear index (iteration)
- Parameters
  d_temp_storage – [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to temp_storage_bytes and no work is done.
  temp_storage_bytes – [in,out] Reference to size in bytes of d_temp_storage allocation
  extents – [in] Extents object that represents a multi-dimensional index space
  op – [in] Function object to apply to each linear index (iteration) and multi-dimensional coordinates
  stream – [in] CUDA stream to launch kernels within. Default stream is NULL.
- Returns
  cudaError_t error status
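The documented snippet above calls the convenience overload; the sketch below (illustrative, with hypothetical names, and the <cuda/std/mdspan> include assumed) applies the same two-phase temporary-storage protocol used by the other algorithms to this overload.

#include <cub/device/device_for.cuh>
#include <cuda/std/mdspan>
#include <thrust/device_vector.h>
#include <cstdint>

struct store_row_col_t
{
  int* d_out;   // assumed to hold rows * cols ints
  int cols;
  __device__ void operator()(int idx, int row, int col) { d_out[idx] = row * cols + col; }
};

int main()
{
  cuda::std::extents<int, 2, 3> extents{};
  thrust::device_vector<int> out(2 * 3);
  store_row_col_t op{thrust::raw_pointer_cast(out.data()), 3};

  // 1) Get temp storage size
  void* d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceFor::ForEachInExtents(d_temp_storage, temp_storage_bytes, extents, op);

  // 2) Allocate temp storage
  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  // 3) Iterate over the extents
  cub::DeviceFor::ForEachInExtents(d_temp_storage, temp_storage_bytes, extents, op);
  // With the ordering shown in the documented example, out is expected to be {0, 1, 2, 3, 4, 5}.
}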
-
template<typename IndexType, ::cuda::std::size_t... Extents, typename OpType>
static inline cudaError_t ForEachInExtents(const ::cuda::std::extents<IndexType, Extents...> &extents, OpType op, cudaStream_t stream = {})

Overview

Iterates through a multi-dimensional extents, producing for each iteration
- a single linear index that represents the current iteration
- the list of indices containing the coordinates for each extent dimension
and then applies the function object op to each linear index and coordinate list.

The return value of op, if any, is ignored.

Note: DeviceFor::ForEachInExtents supports integral index types up to 64 bits.

A Simple Example
The following code snippet demonstrates how to use ForEachInExtents to tabulate a 3D array with its coordinates.

struct linear_store_3D
{
  using data_t = cuda::std::array<int, 3>;

  cuda::std::span<data_t> d_output_raw;

  __device__ void operator()(int idx, int x, int y, int z)
  {
    d_output_raw[idx] = {x, y, z};
  }
};
using data_t = cuda::std::array<int, 3>;
cuda::std::extents<int, 3, 2, 2> extents{};
thrust::device_vector<data_t> d_output(cub::detail::size(extents));
thrust::host_vector<data_t> h_output(cub::detail::size(extents));
auto d_output_raw = cuda::std::span<data_t>{thrust::raw_pointer_cast(d_output.data()), 3 * 2 * 2};

thrust::host_vector<data_t> expected = {
  {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
  {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1},
  {2, 0, 0}, {2, 0, 1}, {2, 1, 0}, {2, 1, 1}};

cub::DeviceFor::ForEachInExtents(extents, [=] __device__ (int idx, int x, int y, int z) {
  d_output_raw[idx] = {x, y, z};
});
h_output = d_output;
REQUIRE(h_output == expected);

thrust::fill(d_output.begin(), d_output.end(), data_t{});

cub::DeviceFor::ForEachInExtents(extents, linear_store_3D{d_output_raw});
h_output = d_output;
REQUIRE(h_output == expected);
- Template Parameters
  IndexType – is an integral type that represents the extent index space (automatically deduced)
  Extents – are the extent sizes for each rank index (automatically deduced)
  OpType – is a function object with arity equal to the number of extents + 1 for the linear index (iteration)
- Parameters
  extents – [in] Extents object that represents a multi-dimensional index space
  op – [in] Function object to apply to each linear index (iteration) and multi-dimensional coordinates
  stream – [in] CUDA stream to launch kernels within. Default stream is NULL.
- Returns
  cudaError_t error status
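As a final sketch (an assumption: it presumes cuda::std::dextents is available alongside cuda::std::extents in the libcu++ version in use), extents with run-time sizes can be used when the shape is not known at compile time.

#include <cub/device/device_for.cuh>
#include <cuda/std/mdspan>
#include <thrust/device_vector.h>

struct store_coords_t
{
  int* d_out;
  // Encode the (row, col) coordinate of each iteration into the output slot.
  __device__ void operator()(int idx, int row, int col) { d_out[idx] = row * 100 + col; }
};

int main()
{
  int rows = 4, cols = 5;   // sizes known only at run time
  cuda::std::dextents<int, 2> extents{rows, cols};

  thrust::device_vector<int> out(rows * cols);
  cub::DeviceFor::ForEachInExtents(extents, store_coords_t{thrust::raw_pointer_cast(out.data())});
  // Each out[idx] now encodes the coordinates visited at iteration idx.
}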
-
template<class ShapeT, class OpT>