cuda::experimental::stf::stackable_ctx#

class stackable_ctx#

This class defines a context that can have nested subcontexts (implemented as local CUDA graphs)

Public Types

template<typename T>
using logical_data_t = ::cuda::experimental::stf::stackable_logical_data<T>#

Public Functions

inline stackable_ctx()#
inline auto &get_node(size_t offset)#
inline int get_parent_offset(int offset) const#
inline const auto &get_children_offsets(int parent) const#
inline context &get_root_ctx()#
inline const context &get_root_ctx() const#
inline int get_root_offset() const#
inline context &get_ctx(int offset)#
inline const context &get_ctx(int offset) const#
inline int get_head_offset() const#
inline bool has_head_set() const#

True if the current thread has a head offset set (has entered the context).

inline void set_head_offset(int offset)#
inline void push(
const ::cuda::std::source_location loc = ::cuda::std::source_location::current()
)#
inline void pop()#
inline graph_scope_guard graph_scope(
const ::cuda::std::source_location &loc = ::cuda::std::source_location::current()
)#
template<typename T>
inline auto logical_data(shape_of<T> s)#
template<typename T, typename ...Sizes>
inline auto logical_data(
size_t elements,
Sizes... more_sizes
)#
template<typename T>
inline auto logical_data_no_export(
shape_of<T> s
)#
template<typename T, typename ...Sizes>
inline auto logical_data_no_export(
size_t elements,
Sizes... more_sizes
)#
inline stackable_logical_data<void_interface> token()#
template<typename ...Pack>
inline auto logical_data(Pack&&... pack)#
template<typename ...Pack>
inline void validate_and_push(
int offset,
const Pack&... pack
) const#
template<typename ExecPlace, typename ...Deps, ::std::enable_if_t<::std::is_base_of_v<exec_place, ::std::decay_t<ExecPlace>>, int> = 0>
inline auto task(
ExecPlace &&e_place,
Deps&&... deps
)#
template<typename ...Deps>
inline auto task(Deps&&... deps)#
template<typename ...Pack>
inline auto host_launch(Pack&&... pack)#
inline auto fence()#
template<typename T>
inline auto wait(
::cuda::experimental::stf::stackable_logical_data<T> &ldata
)#
inline auto get_dot()#
template<typename ...Pack>
inline void push_affinity(
Pack&&... pack
) const#
inline void pop_affinity() const#
inline auto &current_affinity() const#
inline const exec_place &current_exec_place() const#
inline auto &async_resources() const#
inline auto dot_section(::std::string symbol) const#
inline size_t task_count() const#
inline void finalize()#
inline ::std::shared_lock<::std::shared_mutex> acquire_shared_lock(
) const#
inline ::std::unique_lock<::std::shared_mutex> acquire_exclusive_lock(
)#

Public Members

::std::shared_ptr<impl> pimpl#
template<typename ...Deps>
class deferred_task_builder#

Defers task creation until all dependencies are known.

In a regular context, task() immediately creates the underlying task and add_deps() appends to it — access modes are already resolved.

In a stackable context, we cannot create the task immediately because add_deps() may later introduce additional dependencies on the same logical data with a different access mode (e.g. read initially, then write via add_deps). The data must be imported (pushed/frozen) with the combined mode (rw in that example), but once data is frozen with a given mode it cannot be upgraded.

deferred_task_builder therefore collects all dependencies first, combines access modes per logical data, validates and auto-pushes with the correct combined mode, and only then creates the real task via the underlying context. The task is concretized lazily on the first call to operator->* (when the task body is provided) or set_symbol.

Public Functions

template<typename ExecPlace>
inline deferred_task_builder(
stackable_ctx &sctx,
int offset,
ExecPlace &&exec_place,
Deps&&... deps
)#
template<typename ...MoreDeps>
inline auto &add_deps(
MoreDeps&&... deps
)#
template<typename F>
inline auto operator->*(F &&f)#
inline auto &start()#
inline auto &set_symbol(::std::string s) &#
inline auto &&set_symbol(::std::string s) &&#
template<typename ExecPlace>
inline auto &set_exec_place(
ExecPlace &&ep
) &#
template<typename ExecPlace>
inline auto &&set_exec_place(
ExecPlace &&ep
) &&#
template<typename T>
inline auto get(size_t index) const#

Public Members

stackable_ctx &sctx_#
int offset_#
exec_place exec_place_#
::std::tuple<Deps...> task_deps_tuple_#
::std::vector<additional_dep_info> additional_deps_#
::std::optional<::cuda::experimental::stf::task> concrete_task_#
::std::optional<::std::string> symbol_#
struct additional_dep_info#

Public Members

int logical_data_id#
access_mode mode#
::std::function<void(stackable_ctx&, int, access_mode)> validate_access_op#
::std::function<task_dep_untyped(int, access_mode)> resolve_op#
class graph_scope_guard#

RAII wrapper for automatic push/pop management (lock_guard style)

This class provides automatic scope management for nested contexts, following the same semantics as std::lock_guard. The constructor calls push() and the destructor calls pop().

Usage (direct constructor style):

{
  stackable_ctx::graph_scope_guard scope{ctx};
  // nested context operations...
}

Usage (factory method style):

{
  auto scope = ctx.graph_scope();
  // nested context operations...
}

Public Types

using context_type = stackable_ctx#

Public Functions

inline explicit graph_scope_guard(
stackable_ctx &ctx,
const ::cuda::std::source_location &loc = ::cuda::std::source_location::current()
)#
inline ~graph_scope_guard()#
graph_scope_guard(const graph_scope_guard&) = delete#
graph_scope_guard &operator=(const graph_scope_guard&) = delete#
graph_scope_guard(graph_scope_guard&&) = delete#
graph_scope_guard &operator=(graph_scope_guard&&) = delete#
class impl#

Public Functions

inline impl()#
inline ~impl()#
impl(const impl&) = delete#
impl &operator=(const impl&) = delete#
impl(impl&&) noexcept = delete#
impl &operator=(impl&&) noexcept = delete#
template<typename ContextType>
inline async_resources_handle get_async_handle(
const ContextType &parent_ctx
)#

Helper to get async handle from pool or create new one.

inline void push(
const ::cuda::std::source_location &loc,
bool is_root = false,
const push_while_config &config = push_while_config{}
)#

Create a new nested level.

head_offset is the offset of the thread’s current top context (-1 if none)

inline void _pop_prologue()#
inline void _pop_epilogue(event_list &finalize_prereqs)#
inline void pop()#

Terminate the current nested level and get back to the previous one.

inline int get_root_offset() const#
inline context &get_root_ctx()#
inline const context &get_root_ctx() const#
inline ::std::unique_ptr<ctx_node_base> &get_node(int offset)#
inline const ::std::unique_ptr<ctx_node_base> &get_node(
int offset
) const#
inline context &get_ctx(int offset)#
inline const context &get_ctx(int offset) const#
inline int get_head_offset() const#
inline bool has_head_set() const#

True if the current thread has a head offset set (has entered the context).

inline void set_head_offset(int offset)#
inline int get_parent_offset(int offset) const#
inline const auto &get_children_offsets(int parent) const#
inline ::std::shared_lock<::std::shared_mutex> acquire_shared_lock(
) const#
inline ::std::unique_lock<::std::shared_mutex> acquire_exclusive_lock(
)#