cuda::experimental::stf::algorithm
Defined in include/cuda/experimental/stf.cuh
-
class algorithm
Algorithms are a mechanism to implement reusable task sequences by means of CUDA graphs nested within a task.
The underlying CUDA graphs are cached so that they are tentatively reused when the algorithm is run again. Nested algorithms are internally implemented as child graphs.
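For illustration, a minimal sketch of the intended usage, closely following the example given for runner() below (the symbol "double_in_place", the host arrays, the sizes and the iteration count are illustrative, not part of the API): an algorithm is constructed once and launched repeatedly with run_as_task, so its body is recorded as a CUDA graph on the first launch and the cached graph is tentatively reused on later launches.

    #include <cuda/experimental/stf.cuh>

    using namespace cuda::experimental::stf;

    int main()
    {
      context ctx;

      double hX[128], hY[128];
      for (size_t i = 0; i < 128; i++)
      {
        hX[i] = 1.0;
        hY[i] = 0.0;
      }

      auto lX = ctx.logical_data(hX);
      auto lY = ctx.logical_data(hY);

      algorithm alg("double_in_place");

      for (size_t iter = 0; iter < 10; iter++)
      {
        // Same task sequence every iteration, so the cached CUDA graph can be reused
        alg.run_as_task(
          [](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
            inner_ctx.parallel_for(Y.shape(), X.read(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
              y(i) += 2.0 * x(i);
            };
          },
          ctx, lX.read(), lY.rw());
      }

      ctx.finalize();
    }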
Public Functions
-
inline algorithm(::std::string _symbol = "algorithm")
-
template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run_in_graph(Fun fun, parent_ctx_t &parent_ctx, cudaGraph_t graph, Args... args)
-
template<typename context_t, typename Fun, typename ...Deps>
inline void run_inline(Fun fun, context_t &ctx, task_dep<Deps>... deps)
-
template<typename Fun, typename ...Deps>
inline void run_as_task(Fun fun, stream_ctx &ctx, task_dep<Deps>... deps)
-
template<typename Fun, typename ...Deps>
inline void run_as_task(Fun fun, graph_ctx &ctx, task_dep<Deps>... deps)
-
template<typename Fun, typename ...Deps>
inline void run_as_task(Fun fun, context &ctx, task_dep<Deps>... deps)
Executes fun within a task that takes a pack of dependencies. As an alternative, run_as_task_dynamic may take a variable number of dependencies.
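For illustration, a minimal sketch (the function name accumulate_twice and the accumulate body are illustrative; lX, lY and lZ are assumed to be logical_data<slice<double>> handles previously created on ctx): the same algorithm body can be submitted several times with different dependency packs.

    #include <cuda/experimental/stf.cuh>

    using namespace cuda::experimental::stf;

    void accumulate_twice(context& ctx,
                          logical_data<slice<double>> lX,
                          logical_data<slice<double>> lY,
                          logical_data<slice<double>> lZ)
    {
      auto accumulate = [](context inner_ctx, logical_data<slice<double>> in, logical_data<slice<double>> out) {
        inner_ctx.parallel_for(out.shape(), in.read(), out.rw())->*[] __device__(size_t i, auto x, auto y) {
          y(i) += x(i);
        };
      };

      algorithm alg("accumulate");
      alg.run_as_task(accumulate, ctx, lX.read(), lY.rw()); // lY += lX
      alg.run_as_task(accumulate, ctx, lY.read(), lZ.rw()); // lZ += lY
    }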
-
template<typename Fun>
inline void run_as_task_dynamic(Fun fun, stream_ctx &ctx, const ::std::vector<task_dep_untyped> &deps)
-
template<typename Fun>
inline void run_as_task_dynamic(Fun, graph_ctx&, const ::std::vector<task_dep_untyped>&)
-
template<typename Fun>
inline void run_as_task_dynamic(Fun fun, context &ctx, const ::std::vector<task_dep_untyped> &deps)
Executes fun within a task that takes a vector of untyped dependencies. This is an alternative to run_as_task that may take a variable number of dependencies.
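A hedged sketch of the dynamic form, where the number of dependencies is only known at run time. The callback signature used here (a nested context plus the enclosing task, from which the dependency instances would be retrieved) is an assumption made for illustration and is not quoted from the header; the function name process_pieces is also illustrative.

    #include <cuda/experimental/stf.cuh>
    #include <vector>

    using namespace cuda::experimental::stf;

    void process_pieces(context& ctx, ::std::vector<logical_data<slice<double>>>& pieces)
    {
      // The number of dependencies is only known at run time
      ::std::vector<task_dep_untyped> deps;
      for (auto& p : pieces)
      {
        deps.push_back(p.rw());
      }

      algorithm alg("process_pieces");
      alg.run_as_task_dynamic(
        [](context inner_ctx, auto& t) { // assumed callback signature
          // ... build the inner task sequence here, retrieving each
          //     dependency instance from t ...
        },
        ctx, deps);
    }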
-
template<typename context_t, typename ...Deps>
inline runner_impl<context_t, Deps...> runner(context_t &ctx, task_dep<Deps>... deps)
Helper to use an algorithm with the ->* idiom instead of passing the implementation as an argument of run_as_task.
Example:

    algorithm alg;
    alg.runner(ctx, lX.read(), lY.rw())->*[](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
      inner_ctx.parallel_for(Y.shape(), X.read(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
        y(i) = 2.0 * x(i);
      };
    };

Which is equivalent to:

    auto fn = [](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
      inner_ctx.parallel_for(Y.shape(), X.read(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
        y(i) = 2.0 * x(i);
      };
    };

    algorithm alg;
    alg.run_as_task(fn, ctx, lX.read(), lY.rw());
-
template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run(Fun fun, parent_ctx_t &parent_ctx, cudaStream_t stream, Args... args)
-
template<typename Fun, typename parent_ctx_t, typename task_t>
inline void run_dynamic(Fun fun, parent_ctx_t &parent_ctx, cudaStream_t stream, task_t &t)