cuda::experimental::stf::algorithm#

class algorithm#

Algorithms are a mechanism to implement reusable task sequences by means of CUDA graphs nested within a task.

The underlying CUDA graphs are cached so that they can be tentatively reused when the algorithm is run again. Nested algorithms are internally implemented as child graphs.
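For illustration, a minimal sketch of the intended usage, assuming a parent context ctx and logical data lX and lY created beforehand (ctx, lX, lY and niter are hypothetical names; the callable follows the shape of the runner example further down):

    // The algorithm body: a nested context plus one logical_data argument per dependency.
    auto fn = [](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
      inner_ctx.parallel_for(Y.shape(), X.read(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
        y(i) = 2.0 * x(i);
      };
    };

    algorithm alg;
    for (size_t iter = 0; iter < niter; iter++) {
      // The first launch records the nested CUDA graph; subsequent launches
      // tentatively reuse the cached graph instead of rebuilding it.
      alg.run_as_task(fn, ctx, lX.read(), lY.rw());
    }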

Public Functions

inline algorithm(::std::string _symbol = "algorithm")#
template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run_in_graph(
Fun &&fun,
parent_ctx_t &parent_ctx,
cudaGraph_t graph,
const Args&... args,
)#
template<typename context_t, typename Fun, typename ...Deps>
inline void run_inline(
Fun &&fun,
context_t &ctx,
const task_dep<Deps>&... deps,
)#
template<typename Fun, typename ...Deps>
inline void run_as_task(
Fun &&fun,
stream_ctx &ctx,
task_dep<Deps>... deps,
)#
template<typename Fun, typename ...Deps>
inline void run_as_task(
Fun &&fun,
graph_ctx &ctx,
task_dep<Deps>... deps,
)#
template<typename Fun, typename ...Deps>
inline void run_as_task(
Fun &&fun,
context &ctx,
task_dep<Deps>... deps,
)#

Executes fun within a task that takes a pack of dependencies.

As an alternative, run_as_task_dynamic may take a variable number of dependencies.

template<typename Fun>
inline void run_as_task_dynamic(
Fun &&fun,
stream_ctx &ctx,
task_dep_vector_untyped deps,
)#
template<typename Fun>
inline void run_as_task_dynamic(
Fun&&,
graph_ctx&,
const ::std::vector<task_dep_untyped>&,
)#
template<typename Fun>
inline void run_as_task_dynamic(
Fun &&fun,
context &ctx,
task_dep_vector_untyped deps,
)#

Executes fun within a task that takes a vector of untyped dependencies.

This is an alternative to run_as_task that accepts a variable number of dependencies; see the sketch below.
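A sketch of the dynamic form, under two assumptions flagged in the comments: that task_dep_vector_untyped can be filled like a ::std::vector of task_dep_untyped (consistent with the graph_ctx overload above), and that the callable receives the nested context together with the enclosing task, from which data instances are fetched by their index in the dependency vector:

    // Hypothetical: the set of dependencies is only known at run time.
    task_dep_vector_untyped deps;
    deps.push_back(lX.read());
    deps.push_back(lY.rw());

    algorithm alg;
    alg.run_as_task_dynamic(
      [](context inner_ctx, auto& t) {
        // Assumed retrieval of each data instance by its position in deps.
        auto x = t.template get<slice<double>>(0);
        auto y = t.template get<slice<double>>(1);
        // ... submit work on inner_ctx using x and y ...
      },
      ctx, deps);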

template<typename context_t, typename ...Deps>
inline runner_impl<context_t, Deps...> runner(
context_t &ctx,
task_dep<Deps>... deps,
)#

Helper to launch the algorithm with the ->* idiom instead of passing the implementation as an argument to run_as_task.

Example:

    algorithm alg;
    alg.runner(ctx, lX.read(), lY.rw())->*[](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
      inner_ctx.parallel_for(Y.shape(), X.rw(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
        y(i) = 2.0 * x(i);
      };
    };

Which is equivalent to:

    auto fn = [](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
      inner_ctx.parallel_for(Y.shape(), X.rw(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
        y(i) = 2.0 * x(i);
      };
    };

    algorithm alg;
    alg.run_as_task(fn, ctx, lX.read(), lY.rw());
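Both forms build the same task: the ->* idiom keeps the algorithm body inline at the launch site, while passing a named callable to run_as_task is convenient when the same implementation is launched from several places.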

inline auto setup_allocator(graph_ctx &gctx, cudaStream_t stream)#
template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run(
Fun &&fun,
parent_ctx_t &parent_ctx,
cudaStream_t stream,
const Args&... args,
)#
template<typename Fun, typename parent_ctx_t, typename task_t>
inline void run_dynamic(
Fun &&fun,
parent_ctx_t &parent_ctx,
cudaStream_t stream,
task_t &t,
)#