38 template <
typename Func,
int Rank,
typename Params>
44 if (!grid_size || !block_size) {
47 cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
50 reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));
52 if (result != cudaSuccess) {
53 throw std::runtime_error(
"Failed to query occupancy.");
58 block_size = (block_size < 128 ? block_size : 128);
61 dim3 grid(grid_size, 1, 1);
62 dim3 block(block_size, 1, 1);
64 kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);
71 template <
typename Func,
int Rank,
typename Params>
81 dim3 block(block_size, 1, 1);
82 dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
84 kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block >>>(size, params, start, end);
91 template <
typename Element,
typename Func>
98 typename Func::Params params =
typename Func::Params(),
100 int block_size = 0) {
102 if (!grid_size || !block_size) {
105 cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
108 reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));
110 if (result != cudaSuccess) {
111 throw std::runtime_error(
"Failed to query occupancy.");
116 block_size = (block_size < 128 ? block_size : 128);
119 dim3 grid(grid_size, 1, 1);
120 dim3 block(block_size, 1, 1);
122 kernel::BlockForEach<Element, Func><<< grid, block >>>(ptr, capacity, params);
Definition: aligned_buffer.h:35
TensorDiagonalForEach(Coord< Rank > size, Params params=Params(), int start=0, int end=-1, int block_size=128)
Constructor performs the operation.
Definition: device/tensor_foreach.h:75
TensorForEach(Coord< Rank > size, Params params=Params(), int grid_size=0, int block_size=0)
Constructor performs the operation.
Definition: device/tensor_foreach.h:42
Launches a kernel calling a functor for each element along a tensor's diagonal.
Definition: device/tensor_foreach.h:72
BlockForEach(Element *ptr, size_t capacity, typename Func::Params params=typename Func::Params(), int grid_size=0, int block_size=0)
Constructor performs the operation.
Definition: device/tensor_foreach.h:95
Launches a kernel calling a functor for each element in a tensor's index space.
Definition: device/tensor_foreach.h:39
Statically-sized array specifying Coords within a tensor.
Definition: coord.h:43
Definition: device/tensor_foreach.h:92
Basic include for CUTLASS.