cuda::experimental::stf::stackable_ctx#

class stackable_ctx#

This class defines a context that can have nested subcontexts (implemented as local CUDA graphs)

Public Types

template<typename T>
using logical_data_t = ::cuda::experimental::stf::stackable_logical_data<T>#

Public Functions

inline stackable_ctx()#
inline auto &get_node(size_t offset)#
inline int get_parent_offset(int offset) const#
inline const auto &get_children_offsets(int parent) const#
inline context &get_root_ctx()#
inline const context &get_root_ctx() const#
inline int get_root_offset() const#
inline context &get_ctx(int offset)#
inline const context &get_ctx(int offset) const#
inline int get_head_offset() const#
inline bool has_head_set() const#

True if the current thread has a head offset set (has entered the context).

inline void set_head_offset(int offset)#
inline void push(
const ::cuda::std::source_location loc = ::cuda::std::source_location::current()
)#
inline void pop()#
inline graph_scope_guard graph_scope(
const ::cuda::std::source_location &loc = ::cuda::std::source_location::current()
)#
template<typename T>
inline auto logical_data(shape_of<T> s)#
template<typename T, typename ...Sizes>
inline auto logical_data(
size_t elements,
Sizes... more_sizes
)#
template<typename T>
inline auto logical_data_no_export(
shape_of<T> s
)#
template<typename T, typename ...Sizes>
inline auto logical_data_no_export(
size_t elements,
Sizes... more_sizes
)#
inline stackable_logical_data<void_interface> token()#
template<typename ...Pack>
inline auto logical_data(Pack&&... pack)#
template<typename ...Pack>
inline void validate_and_push(
int offset,
const Pack&... pack
) const#
template<typename ExecPlace, typename ...Deps, ::std::enable_if_t<::std::is_base_of_v<exec_place, ::std::decay_t<ExecPlace>>, int> = 0>
inline auto task(
ExecPlace &&e_place,
Deps&&... deps
)#
template<typename ...Deps>
inline auto task(Deps&&... deps)#
template<typename ...Pack>
inline auto host_launch(Pack&&... pack)#
inline auto fence()#
template<typename T>
inline auto wait(
::cuda::experimental::stf::stackable_logical_data<T> &ldata
)#
inline auto get_dot()#
template<typename ...Pack>
inline void push_affinity(
Pack&&... pack
) const#
inline void pop_affinity() const#
inline auto &current_affinity() const#
inline const exec_place &current_exec_place() const#
inline auto &async_resources() const#
inline auto dot_section(::std::string symbol) const#
inline size_t task_count() const#
inline void finalize()#
inline ::std::shared_lock<::std::shared_mutex> acquire_shared_lock(
) const#
inline ::std::unique_lock<::std::shared_mutex> acquire_exclusive_lock(
)#

Public Members

::std::shared_ptr<impl> pimpl#
template<typename ...Deps>
class deferred_task_builder#

Defers task creation until all dependencies are known.

In a regular context, task() immediately creates the underlying task and add_deps() appends to it — access modes are already resolved.

In a stackable context, we cannot create the task immediately because add_deps() may later introduce additional dependencies on the same logical data with a different access mode (e.g. read initially, then write via add_deps). The data must be imported (pushed/frozen) with the combined mode (rw in that example), but once data is frozen with a given mode it cannot be upgraded.

deferred_task_builder therefore collects all dependencies first, combines access modes per logical data, validates and auto-pushes with the correct combined mode, and only then creates the real task via the underlying context. The task is concretized lazily on the first call to operator->* (when the task body is provided) or set_symbol.

Public Functions

template<typename ExecPlace>
inline deferred_task_builder(
stackable_ctx &sctx,
int offset,
ExecPlace &&exec_place,
Deps&&... deps
)#
template<typename ...MoreDeps>
inline auto &add_deps(
MoreDeps&&... deps
)#
template<typename F>
inline auto operator->*(F &&f)#
inline auto &start()#
inline auto &set_symbol(::std::string s) &#
inline auto &&set_symbol(::std::string s) &&#
template<typename ExecPlace>
inline auto &set_exec_place(
ExecPlace &&ep
) &#
template<typename ExecPlace>
inline auto &&set_exec_place(
ExecPlace &&ep
) &&#
template<typename T>
inline auto get(size_t index) const#

Public Members

stackable_ctx &sctx_#
int offset_#
exec_place exec_place_#
::std::tuple<Deps...> task_deps_tuple_#
::std::vector<additional_dep_info> additional_deps_#
::std::optional<::cuda::experimental::stf::task> concrete_task_#
::std::optional<::std::string> symbol_#
struct additional_dep_info#

Public Members

int logical_data_id#
access_mode mode#
::std::function<void(stackable_ctx&, int, access_mode)> validate_access_op#
::std::function<task_dep_untyped(int, access_mode)> resolve_op#
class graph_scope_guard#

RAII wrapper for automatic push/pop management (lock_guard style)

This class provides automatic scope management for nested contexts, following the same semantics as std::lock_guard. The constructor calls push() and the destructor calls pop().

Usage (direct constructor style):

{
  stackable_ctx::graph_scope_guard scope{ctx};
  // nested context operations...
}

Usage (factory method style):

{
  auto scope = ctx.graph_scope();
  // nested context operations...
}

Public Types

using context_type = stackable_ctx#

Public Functions

inline explicit graph_scope_guard(
stackable_ctx &ctx,
const ::cuda::std::source_location &loc = ::cuda::std::source_location::current()
)#
inline ~graph_scope_guard()#
graph_scope_guard(const graph_scope_guard&) = delete#
graph_scope_guard &operator=(const graph_scope_guard&) = delete#
graph_scope_guard(graph_scope_guard&&) = delete#
graph_scope_guard &operator=(graph_scope_guard&&) = delete#
class impl#

Public Functions

inline impl()#
inline ~impl()#
impl(const impl&) = delete#
impl &operator=(const impl&) = delete#
impl(impl&&) noexcept = delete#
impl &operator=(impl&&) noexcept = delete#
template<typename ContextType>
inline async_resources_handle get_async_handle(
const ContextType &parent_ctx
)#

Helper to get async handle from pool or create new one.

inline void push(
const ::cuda::std::source_location &loc,
bool is_root = false,
const push_while_config &config = push_while_config{}
)#

Create a new nested level.

head_offset is the offset of the thread’s current top context (-1 if none)

inline void _pop_prologue()#
inline void _pop_epilogue(event_list &finalize_prereqs)#
inline void pop()#

Terminate the current nested level and get back to the previous one.

inline int get_root_offset() const#
inline context &get_root_ctx()#
inline const context &get_root_ctx() const#
inline ::std::unique_ptr<ctx_node_base> &get_node(int offset)#
inline const ::std::unique_ptr<ctx_node_base> &get_node(
int offset
) const#
inline context &get_ctx(int offset)#
inline const context &get_ctx(int offset) const#
inline int get_head_offset() const#
inline bool has_head_set() const#

True if the current thread has a head offset set (has entered the context).

inline void set_head_offset(int offset)#
inline int get_parent_offset(int offset) const#
inline const auto &get_children_offsets(int parent) const#
inline ::std::shared_lock<::std::shared_mutex> acquire_shared_lock(
) const#
inline ::std::unique_lock<::std::shared_mutex> acquire_exclusive_lock(
)#