cuda::experimental::stf::algorithm

Defined in include/cuda/experimental/stf.cuh

class algorithm

Algorithms are a mechanism to implement reusable task sequences, realized by means of CUDA graphs nested within a task.

The underlying CUDA graphs are cached so that they are tentatively reused when the algorithm is run again. Nested algorithms are internally implemented as child graphs.

Public Functions

inline algorithm(::std::string _symbol = "algorithm")
template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run_in_graph(Fun fun, parent_ctx_t &parent_ctx, cudaGraph_t graph, Args... args)
template<typename context_t, typename Fun, typename ...Deps>
inline void run_inline(Fun fun, context_t &ctx, task_dep<Deps>... deps)
template<typename Fun, typename ...Deps>
inline void run_as_task(Fun fun, stream_ctx &ctx, task_dep<Deps>... deps)
template<typename Fun, typename ...Deps>
inline void run_as_task(Fun fun, graph_ctx &ctx, task_dep<Deps>... deps)
template<typename Fun, typename ...Deps>
inline void run_as_task(Fun fun, context &ctx, task_dep<Deps>... deps)

Executes fun within a task that takes a pack of dependencies.

As an alternative, run_as_task_dynamic may take a variable number of dependencies.

template<typename Fun>
inline void run_as_task_dynamic(Fun fun, stream_ctx &ctx, const ::std::vector<task_dep_untyped> &deps)
template<typename Fun>
inline void run_as_task_dynamic(Fun, graph_ctx&, const ::std::vector<task_dep_untyped>&)
template<typename Fun>
inline void run_as_task_dynamic(Fun fun, context &ctx, const ::std::vector<task_dep_untyped> &deps)

Executes fun within a task that takes a vector of untyped dependencies.

This is an alternative to run_as_task for cases where the dependencies are not known statically, as it may take a variable number of dependencies.

template<typename context_t, typename ...Deps>
inline runner_impl<context_t, Deps...> runner(context_t &ctx, task_dep<Deps>... deps)

Helper to invoke an algorithm with the ->* idiom instead of passing the implementation as an argument to run_as_task.

example: algorithm alg; alg.runner(ctx, lX.read(), lY.rw())->*[](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) { inner_ctx.parallel_for(Y.shape(), X.rw(), Y.rw())->*[]__device__(size_t i, auto x, auto y) { y(i) = 2.0*x(i); }; };

Which is equivalent to: auto fn = [](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) { inner_ctx.parallel_for(Y.shape(), X.rw(), Y.rw())->*[]__device__(size_t i, auto x, auto y) { y(i) = 2.0*x(i); }; };

algorithm alg; alg.run_as_task(fn, ctx, lX.read(), lY.rw());

template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run(Fun fun, parent_ctx_t &parent_ctx, cudaStream_t stream, Args... args)
template<typename Fun, typename parent_ctx_t, typename task_t>
inline void run_dynamic(Fun fun, parent_ctx_t &parent_ctx, cudaStream_t stream, task_t &t)