42 template <
typename Func,
int Rank,
int RankRemaining>
52 for (
int i = Rank - RankRemaining; i < Rank; ++i) {
56 coord[Rank - 1 - RankRemaining] = index / product;
57 int64_t remaining = index % product;
64 template <
typename Func,
int Rank>
71 coord[Rank - 1] = index;
84 template <
typename Func,
int Rank,
typename Params>
89 int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
90 int64_t max_index = 1;
93 for (
int i = 0; i < Rank; ++i) {
98 while (index < max_index) {
102 index += blockDim.x * gridDim.x;
109 template <
typename Func,
int Rank,
typename Params>
114 int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start;
120 for (
int i = 0; i < Rank; ++i) {
130 template <
typename Element,
typename Func>
134 typename Func::Params params) {
138 size_t index = threadIdx.x + blockIdx.x * blockDim.x;
140 for (; index < capacity; index += blockDim.x * gridDim.x) {
Definition: aligned_buffer.h:35
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
__inline__ __device__ TensorForEachHelper(Func &func, Coord< Rank > const &size, Coord< Rank > &coord, int64_t index)
Constructor for the fastest-changing rank.
Definition: device/kernel/tensor_foreach.h:69
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
__global__ void BlockForEach(Element *ptr, size_t capacity, typename Func::Params params)
Definition: device/kernel/tensor_foreach.h:131
#define CUTLASS_PRAGMA_NO_UNROLL
Definition: cutlass.h:111
Statically-sized array specifying Coords within a tensor.
Definition: coord.h:43
__inline__ __device__ TensorForEachHelper(Func &func, Coord< Rank > const &size, Coord< Rank > &coord, int64_t index)
Constructor for the general rank.
Definition: device/kernel/tensor_foreach.h:47
__global__ void TensorDiagonalForEach(Coord< Rank > size, Params params, int start, int end)
Kernel calls a functor for each element along a tensor's diagonal.
Definition: device/kernel/tensor_foreach.h:110
__global__ void TensorForEach(Coord< Rank > size, Params params=Params())
Kernel calls a functor for each element in a tensor's index space.
Definition: device/kernel/tensor_foreach.h:85
Helper to perform a for-each operation.
Definition: device/kernel/tensor_foreach.h:43
Basic include for CUTLASS.