cuda::apply_access_property
template <class ShapeT>
__host__ __device__
void apply_access_property(void const volatile* ptr, ShapeT shape, cuda::access_property::persisting) noexcept;
template <class ShapeT>
__host__ __device__
void apply_access_property(void const volatile* ptr, ShapeT shape, cuda::access_property::normal) noexcept;
Mandates: ShapeT is either std::size_t or cuda::aligned_size_t.
Preconditions: ptr points to a valid allocation for shape in the global memory address space.
Effects: no effects.
Hint: to prefetch shape bytes of memory starting at ptr while applying a property. Two properties are supported: cuda::access_property::persisting and cuda::access_property::normal.
Note: in Preconditions, “valid allocation for shape” means that:
- if ShapeT is aligned_size_t<N>(sz), then ptr is aligned to an N-byte alignment boundary, and
- for all offsets i in the extent of shape, i.e., i in [0, shape), the expression *(ptr + i) does not exhibit undefined behavior.
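As a minimal sketch, assuming a hypothetical device pointer p to at least 64 ints in global memory that is aligned to a 32-byte boundary, both supported shape types could be used as follows:
#include <cuda/annotated_ptr>  // cuda::apply_access_property, cuda::access_property
#include <cuda/barrier>        // cuda::aligned_size_t (assumed header)
__global__ void prefetch_hints(int* p) {
    // ShapeT = std::size_t: p must point to at least 64 * sizeof(int) valid bytes.
    cuda::apply_access_property(p, 64 * sizeof(int), cuda::access_property::persisting{});
    // ShapeT = cuda::aligned_size_t<32>: additionally requires p to be aligned
    // to a 32-byte boundary.
    cuda::apply_access_property(p, cuda::aligned_size_t<32>(64 * sizeof(int)), cuda::access_property::normal{});
}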
Note: currently apply_access_property is ignored by nvcc and nvc++ on the host.
Example
Given three input and output vectors x, y, and z, and two arrays of coefficients a and b, all of length N:
size_t N;
int* x, *y, *z;
int* a, *b;
the grid-strided kernel:
__global__ void update(int* const x, int const* const a, int const* const b, size_t N) {
    auto g = cooperative_groups::this_grid();
    for (int idx = g.thread_rank(); idx < N; idx += g.size()) {
        x[idx] = a[idx] * x[idx] + b[idx];
    }
}
updates x, y, and z as follows:
update<<<grid, block>>>(x, a, b, N);
update<<<grid, block>>>(y, a, b, N);
update<<<grid, block>>>(z, a, b, N);
The elements of a and b are used in all kernels. For certain values of N, parts of a and b may be evicted from the L2 cache between launches, requiring them to be re-loaded from memory in the subsequent update kernels.
With cuda::access_property and cuda::apply_access_property, we can write kernels that mark a and b as accessed more often than other data (pin) or as often as other data (unpin):
__global__ void pin(int* a, int* b, size_t N) {
    auto g = cooperative_groups::this_grid();
    for (int idx = g.thread_rank(); idx < N; idx += g.size()) {
        cuda::apply_access_property(a + idx, sizeof(int), cuda::access_property::persisting{});
        cuda::apply_access_property(b + idx, sizeof(int), cuda::access_property::persisting{});
    }
}

__global__ void unpin(int* a, int* b, size_t N) {
    auto g = cooperative_groups::this_grid();
    for (int idx = g.thread_rank(); idx < N; idx += g.size()) {
        cuda::apply_access_property(a + idx, sizeof(int), cuda::access_property::normal{});
        cuda::apply_access_property(b + idx, sizeof(int), cuda::access_property::normal{});
    }
}
which we can launch before and after the update kernels:
pin<<<grid, block>>>(a, b, N);
update<<<grid, block>>>(x, a, b, N);
update<<<grid, block>>>(y, a, b, N);
update<<<grid, block>>>(z, a, b, N);
unpin<<<grid, block>>>(a, b, N);
This does not require modifying the update kernel, and for certain values of N prevents a and b from having to be re-loaded from memory.
The pin and unpin kernels can also be fused into the kernels for the x and z updates by modifying those kernels, as sketched below.
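For instance, a hedged sketch of such fused variants (the kernel names update_and_pin and update_and_unpin are hypothetical, not part of the example above):
#include <cooperative_groups.h>
#include <cuda/annotated_ptr>

// Fuses the pin hints into the first update (of x).
__global__ void update_and_pin(int* const x, int const* const a, int const* const b, size_t N) {
    auto g = cooperative_groups::this_grid();
    for (int idx = g.thread_rank(); idx < N; idx += g.size()) {
        // Hint that a and b will be reused by the following update kernels.
        cuda::apply_access_property(a + idx, sizeof(int), cuda::access_property::persisting{});
        cuda::apply_access_property(b + idx, sizeof(int), cuda::access_property::persisting{});
        x[idx] = a[idx] * x[idx] + b[idx];
    }
}

// Fuses the unpin hints into the last update (of z).
__global__ void update_and_unpin(int* const z, int const* const a, int const* const b, size_t N) {
    auto g = cooperative_groups::this_grid();
    for (int idx = g.thread_rank(); idx < N; idx += g.size()) {
        z[idx] = a[idx] * z[idx] + b[idx];
        // Return a and b to the normal access property once they are no longer reused.
        cuda::apply_access_property(a + idx, sizeof(int), cuda::access_property::normal{});
        cuda::apply_access_property(b + idx, sizeof(int), cuda::access_property::normal{});
    }
}
The middle update of y is then launched unchanged between the two fused kernels:
update_and_pin<<<grid, block>>>(x, a, b, N);
update<<<grid, block>>>(y, a, b, N);
update_and_unpin<<<grid, block>>>(z, a, b, N);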