cuda::experimental::stf::algorithm#

class algorithm#

Algorithms are a mechanism to implement reusable task sequences by means of CUDA graphs nested within a task.

The underlying CUDA graphs are cached so that they can be tentatively reused when the algorithm is run again. Nested algorithms are internally implemented as child graphs.
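For illustration, a minimal sketch of the intended usage, assuming a parent context ctx and logical data lX and lY created beforehand (ctx, lX, lY and niter are hypothetical names; the callable follows the shape of the runner example further down):

    // The algorithm body: a nested context plus one logical_data argument per dependency.
    auto fn = [](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
      inner_ctx.parallel_for(Y.shape(), X.read(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
        y(i) = 2.0 * x(i);
      };
    };

    algorithm alg;
    for (size_t iter = 0; iter < niter; iter++) {
      // The first launch records the nested CUDA graph; subsequent launches
      // tentatively reuse the cached graph instead of rebuilding it.
      alg.run_as_task(fn, ctx, lX.read(), lY.rw());
    }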

Public Functions

inline algorithm(::std::string _symbol = "algorithm")#
template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run_in_graph(
Fun &&fun,
parent_ctx_t &parent_ctx,
cudaGraph_t graph,
const Args&... args,
)#
template<typename context_t, typename Fun, typename ...Deps>
inline void run_inline(
Fun &&fun,
context_t &ctx,
const task_dep<Deps>&... deps,
)#
template<typename Fun, typename ...Deps>
inline void run_as_task(
Fun &&fun,
stream_ctx &ctx,
task_dep<Deps>... deps,
)#
template<typename Fun, typename ...Deps>
inline void run_as_task(
Fun &&fun,
graph_ctx &ctx,
task_dep<Deps>... deps,
)#
template<typename Fun, typename ...Deps>
inline void run_as_task(
Fun &&fun,
context &ctx,
task_dep<Deps>... deps,
)#

Executes fun within a task that takes a pack of dependencies.

As an alternative, run_as_task_dynamic may take a variable number of dependencies.

template<typename Fun>
inline void run_as_task_dynamic(
Fun &&fun,
stream_ctx &ctx,
task_dep_vector_untyped deps,
)#
template<typename Fun>
inline void run_as_task_dynamic(
Fun&&,
graph_ctx&,
const ::std::vector<task_dep_untyped>&,
)#
template<typename Fun>
inline void run_as_task_dynamic(
Fun &&fun,
context &ctx,
task_dep_vector_untyped deps,
)#

Executes fun within a task that takes a vector of untyped dependencies.

This is an alternative to run_as_task that accepts a variable number of dependencies; see the sketch below.
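A sketch of the dynamic form, under two assumptions flagged in the comments: that task_dep_vector_untyped can be filled like a ::std::vector of task_dep_untyped (consistent with the graph_ctx overload above), and that the callable receives the nested context together with the enclosing task, from which data instances are fetched by their index in the dependency vector:

    // Hypothetical: the set of dependencies is only known at run time.
    task_dep_vector_untyped deps;
    deps.push_back(lX.read());
    deps.push_back(lY.rw());

    algorithm alg;
    alg.run_as_task_dynamic(
      [](context inner_ctx, auto& t) {
        // Assumed retrieval of each data instance by its position in deps.
        auto x = t.template get<slice<double>>(0);
        auto y = t.template get<slice<double>>(1);
        // ... submit work on inner_ctx using x and y ...
      },
      ctx, deps);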

template<typename context_t, typename ...Deps>
inline runner_impl<context_t, Deps...> runner(
context_t &ctx,
task_dep<Deps>... deps,
)#

Helper to launch the algorithm with the ->* idiom instead of passing the implementation as an argument to run_as_task.

Example:

    algorithm alg;
    alg.runner(ctx, lX.read(), lY.rw())->*[](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
      inner_ctx.parallel_for(Y.shape(), X.rw(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
        y(i) = 2.0 * x(i);
      };
    };

Which is equivalent to:

    auto fn = [](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
      inner_ctx.parallel_for(Y.shape(), X.rw(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
        y(i) = 2.0 * x(i);
      };
    };

    algorithm alg;
    alg.run_as_task(fn, ctx, lX.read(), lY.rw());
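Both forms build the same task: the ->* idiom keeps the algorithm body inline at the launch site, while passing a named callable to run_as_task is convenient when the same implementation is launched from several places.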

inline auto setup_allocator(graph_ctx &gctx, cudaStream_t stream)#
template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run(
Fun &&fun,
parent_ctx_t &parent_ctx,
cudaStream_t stream,
const Args&... args,
)#
template<typename Fun, typename parent_ctx_t, typename task_t>
inline void run_dynamic(
Fun &&fun,
parent_ctx_t &parent_ctx,
cudaStream_t stream,
task_t &t,
)#