cuda::experimental::stf::algorithm#
-
class algorithm#
Algorithms are a mechanism to implement reusable task sequences by means of CUDA graphs nested within a task.
The underlying CUDA graphs are cached so that they are tentatively reused when the algorithm is run again. Nested algorithms are internally implemented as child graphs.
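For illustration, a minimal sketch of the intended usage, assuming a context ctx and two logical_data handles lX and lY (these names, the sizes, and the initialization step are illustrative assumptions, not part of this class):
  context ctx;
  auto lX = ctx.logical_data(shape_of<slice<double>>(1024));
  auto lY = ctx.logical_data(shape_of<slice<double>>(1024));

  // Initialize both pieces of data before running the algorithm
  ctx.parallel_for(lX.shape(), lX.write(), lY.write())->*[] __device__(size_t i, auto x, auto y) {
    x(i) = double(i);
    y(i) = 0.0;
  };

  // The reusable task sequence: its body is recorded as a CUDA graph nested within a task
  algorithm alg;
  auto fn = [](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
    inner_ctx.parallel_for(Y.shape(), X.read(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
      y(i) += 2.0 * x(i);
    };
  };

  // Repeated executions tentatively reuse the cached CUDA graph
  for (size_t iter = 0; iter < 10; iter++) {
    alg.run_as_task(fn, ctx, lX.read(), lY.rw());
  }
  ctx.finalize();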
Public Functions
-
inline algorithm(::std::string _symbol = "algorithm")#
-
template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run_in_graph(Fun &&fun, parent_ctx_t &parent_ctx, cudaGraph_t graph, const Args&... args)#
-
template<typename context_t, typename Fun, typename ...Deps>
inline void run_inline(Fun &&fun, context_t &ctx, task_dep<Deps>... deps)#
-
template<typename Fun, typename ...Deps>
inline void run_as_task(Fun &&fun, stream_ctx &ctx, task_dep<Deps>... deps)#
-
template<typename Fun, typename ...Deps>
inline void run_as_task(Fun &&fun, graph_ctx &ctx, task_dep<Deps>... deps)#
-
template<typename Fun, typename ...Deps>
inline void run_as_task(Fun &&fun, context &ctx, task_dep<Deps>... deps)#
Executes fun within a task that takes a pack of dependencies.
As an alternative, run_as_task_dynamic may take a variable number of dependencies.
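As a hedged sketch of how the dependency pack maps to the callable (the handles lA, lX, lY and the axpy-style body are illustrative assumptions): each dependency becomes one logical_data argument of fun, passed after the nested context and in the same order as in the call.
  algorithm axpy("axpy");
  axpy.run_as_task(
    [](context inner_ctx, logical_data<slice<double>> A, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
      // lA.read(), lX.read(), lY.rw() map to A, X, Y respectively
      inner_ctx.parallel_for(Y.shape(), A.read(), X.read(), Y.rw())->*[] __device__(size_t i, auto a, auto x, auto y) {
        y(i) += a(i) * x(i);
      };
    },
    ctx, lA.read(), lX.read(), lY.rw());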
-
template<typename Fun>
inline void run_as_task_dynamic(Fun &&fun, stream_ctx &ctx, task_dep_vector_untyped deps)#
-
template<typename Fun>
inline void run_as_task_dynamic(Fun&&, graph_ctx&, const ::std::vector<task_dep_untyped>&)#
-
template<typename Fun>
inline void run_as_task_dynamic(Fun &&fun, context &ctx, task_dep_vector_untyped deps)#
Executes fun within a task that takes a vector of untyped dependencies.
This is an alternative to run_as_task that can take a variable number of dependencies determined at run time.
-
template<typename context_t, typename ...Deps>
inline runner_impl<context_t, Deps...> runner(context_t &ctx, task_dep<Deps>... deps)#
Helper to use the algorithm with the ->* idiom instead of passing the implementation as an argument of run_as_task.
Example:
  algorithm alg;
  alg.runner(ctx, lX.read(), lY.rw())->*[](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
    inner_ctx.parallel_for(Y.shape(), X.read(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
      y(i) = 2.0 * x(i);
    };
  };
Which is equivalent to:
  auto fn = [](context inner_ctx, logical_data<slice<double>> X, logical_data<slice<double>> Y) {
    inner_ctx.parallel_for(Y.shape(), X.read(), Y.rw())->*[] __device__(size_t i, auto x, auto y) {
      y(i) = 2.0 * x(i);
    };
  };
  algorithm alg;
  alg.run_as_task(fn, ctx, lX.read(), lY.rw());
-
template<typename Fun, typename parent_ctx_t, typename ...Args>
inline void run(Fun &&fun, parent_ctx_t &parent_ctx, cudaStream_t stream, const Args&... args)#
-
template<typename Fun, typename parent_ctx_t, typename task_t>
inline void run_dynamic(Fun &&fun, parent_ctx_t &parent_ctx, cudaStream_t stream, task_t &t)#