cuda::experimental::stf::stackable_ctx#
-
class stackable_ctx#
This class defines a context that behaves as a context which can have nested subcontexts (implemented as local CUDA graphs)
Public Types
-
template<typename T>
using logical_data_t = ::cuda::experimental::stf::stackable_logical_data<T>#
Public Functions
-
inline stackable_ctx()#
-
inline auto &get_node(size_t offset)#
-
inline int get_parent_offset(int offset) const#
-
inline const auto &get_children_offsets(int parent) const#
-
inline int get_root_offset() const#
-
inline int get_head_offset() const#
-
inline bool has_head_set() const#
True if the current thread has a head offset set (has entered the context).
-
inline void set_head_offset(int offset)#
-
inline void push(const ::cuda::std::source_location loc = ::cuda::std::source_location::current())#
-
inline void pop()#
-
inline graph_scope_guard graph_scope(const ::cuda::std::source_location &loc = ::cuda::std::source_location::current())#
-
template<typename T, typename ...Sizes>
inline auto logical_data(size_t elements, Sizes... more_sizes)#
-
template<typename T, typename ...Sizes>
inline auto logical_data_no_export(size_t elements, Sizes... more_sizes)#
-
inline stackable_logical_data<void_interface> token()#
-
template<typename ExecPlace, typename ...Deps, ::std::enable_if_t<::std::is_base_of_v<exec_place, ::std::decay_t<ExecPlace>>, int> = 0>
inline auto task()#
-
inline auto fence()#
-
template<typename T>
inline auto wait(::cuda::experimental::stf::stackable_logical_data<T> &ldata)#
-
inline auto get_dot()#
-
inline void pop_affinity() const#
-
inline auto &current_affinity() const#
-
inline const exec_place &current_exec_place() const#
-
inline auto &async_resources() const#
-
inline auto dot_section(::std::string symbol) const#
-
inline size_t task_count() const#
-
inline void finalize()#
- inline ::std::unique_lock<::std::shared_mutex> acquire_exclusive_lock(
-
template<typename ...Deps>
class deferred_task_builder# Defers task creation until all dependencies are known.
In a regular context, task() immediately creates the underlying task and add_deps() appends to it — access modes are already resolved.
In a stackable context, we cannot create the task immediately because add_deps() may later introduce additional dependencies on the same logical data with a different access mode (e.g. read initially, then write via add_deps). The data must be imported (pushed/frozen) with the combined mode (rw in that example), but once data is frozen with a given mode it cannot be upgraded.
deferred_task_builder therefore collects all dependencies first, combines access modes per logical data, validates and auto-pushes with the correct combined mode, and only then creates the real task via the underlying context. The task is concretized lazily on the first call to operator->* (when the task body is provided) or set_symbol.
Public Functions
-
template<typename ExecPlace>
inline deferred_task_builder(stackable_ctx &sctx, int offset, ExecPlace &&exec_place, Deps&&... deps)#
-
inline auto &start()#
-
inline auto &set_symbol(::std::string s) &#
-
inline auto &&set_symbol(::std::string s) &&#
-
template<typename T>
inline auto get(size_t index) const#
Public Members
-
stackable_ctx &sctx_#
-
int offset_#
-
exec_place exec_place_#
-
::std::vector<additional_dep_info> additional_deps_#
-
::std::optional<::std::string> symbol_#
-
struct additional_dep_info#
Public Members
-
int logical_data_id#
-
access_mode mode#
-
::std::function<void(stackable_ctx&, int, access_mode)> validate_access_op#
-
::std::function<task_dep_untyped(int, access_mode)> resolve_op#
-
int logical_data_id#
-
template<typename ExecPlace>
-
class graph_scope_guard#
RAII wrapper for automatic push/pop management (lock_guard style)
This class provides automatic scope management for nested contexts, following the same semantics as std::lock_guard. The constructor calls push() and the destructor calls pop().
Usage (direct constructor style):
{ stackable_ctx::graph_scope_guard scope{ctx}; // nested context operations... }
Usage (factory method style):
{ auto scope = ctx.graph_scope(); // nested context operations... }
Public Types
-
using context_type = stackable_ctx#
Public Functions
-
inline explicit graph_scope_guard(stackable_ctx &ctx, const ::cuda::std::source_location &loc = ::cuda::std::source_location::current())#
-
inline ~graph_scope_guard()#
-
graph_scope_guard(const graph_scope_guard&) = delete#
-
graph_scope_guard &operator=(const graph_scope_guard&) = delete#
-
graph_scope_guard(graph_scope_guard&&) = delete#
-
graph_scope_guard &operator=(graph_scope_guard&&) = delete#
-
using context_type = stackable_ctx#
-
class impl#
Public Functions
-
inline impl()#
-
inline ~impl()#
-
template<typename ContextType>
inline async_resources_handle get_async_handle(const ContextType &parent_ctx)#
Helper to get async handle from pool or create new one.
-
inline void push(const ::cuda::std::source_location &loc, bool is_root = false, const push_while_config &config = push_while_config{})#
Create a new nested level.
head_offset is the offset of the thread's current top context (-1 if none)
-
inline void _pop_prologue()#
-
inline void _pop_epilogue(event_list &finalize_prereqs)#
-
inline void pop()#
Terminate the current nested level and get back to the previous one.
-
inline int get_root_offset() const#
-
inline ::std::unique_ptr<ctx_node_base> &get_node(int offset)#
-
inline const ::std::unique_ptr<ctx_node_base> &get_node(int offset) const#
-
inline int get_head_offset() const#
-
inline bool has_head_set() const#
True if the current thread has a head offset set (has entered the context).
-
inline void set_head_offset(int offset)#
-
inline int get_parent_offset(int offset) const#
-
inline const auto &get_children_offsets(int parent) const#
- inline ::std::unique_lock<::std::shared_mutex> acquire_exclusive_lock(
-
inline impl()#
-
template<typename T>