Coverage for cuda / core / system / _event.pxi: 35.71%
28 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-08 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-08 01:07 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
# Bind ``nvml.EventType`` to a local name so the classes below can refer to
# it as ``EventType``.
EventType = nvml.EventType
cdef class EventData:
    """
    Data about a single event.

    Wraps an ``nvml.EventData`` record and exposes its fields through
    read-only properties.
    """
    # Extension types have no instance ``__dict__`` by default, so the
    # attribute assigned in ``__init__`` has to be declared on the class.
    # NOTE(review): drop this declaration if ``_event_data`` is already
    # declared for this class in a ``.pxd`` file.
    cdef object _event_data

    def __init__(self, event_data: nvml.EventData):
        self._event_data = event_data

    @property
    def device(self) -> Device:
        """
        The device on which the event occurred.
        """
        # ``__new__`` must be told which type to instantiate; calling it with
        # no arguments raises ``TypeError``. Using ``__new__`` bypasses
        # ``Device.__init__`` so the handle can be assigned directly.
        device = Device.__new__(Device)
        device._handle = self._event_data.device
        return device

    @property
    def event_type(self) -> EventType:
        """
        The type of event that was triggered.
        """
        return EventType(self._event_data.event_type)

    @property
    def event_data(self) -> int:
        """
        Returns Xid error for the device in the event of
        :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR:
            raise ValueError("event_data is only available for Xid critical error events.")
        return self._event_data.event_data

    @property
    def gpu_instance_id(self) -> int:
        """
        The GPU instance ID for MIG devices.

        Only valid for events of type :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR:
            raise ValueError("gpu_instance_id is only available for Xid critical error events.")
        return self._event_data.gpu_instance_id

    @property
    def compute_instance_id(self) -> int:
        """
        The Compute instance ID for MIG devices.

        Only valid for events of type :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR:
            raise ValueError("compute_instance_id is only available for Xid critical error events.")
        return self._event_data.compute_instance_id
cdef class DeviceEvents:
    """
    Represents a set of events that can be waited on for a specific device.

    Parameters
    ----------
    device_handle : intptr_t
        Handle of the device to register the events on.
    events : EventType, int, or list of EventType or int
        The event type(s) to register. A list is combined into a single
        bitmask with bitwise OR.

    Raises
    ------
    TypeError
        If ``events`` is not an ``EventType``, an ``int``, or a ``list``.
    """
    cdef intptr_t _event_set
    cdef intptr_t _device_handle

    def __init__(self, device_handle: intptr_t, events: EventType | int | list[EventType | int]):
        cdef unsigned long long event_bitmask
        if isinstance(events, (int, EventType)):
            event_bitmask = <unsigned long long>int(events)
        elif isinstance(events, list):
            event_bitmask = 0
            for ev in events:
                event_bitmask |= <unsigned long long>int(ev)
        else:
            raise TypeError("events must be an EventType, int, or list of EventType or int")

        self._device_handle = device_handle
        self._event_set = nvml.event_set_create()
        # If this raises, the event needs to be freed and this is handled by
        # this class's __dealloc__ method.
        nvml.device_register_events(self._device_handle, event_bitmask, self._event_set)

    def __dealloc__(self):
        # ``_event_set`` is still zero when ``__init__`` raised before the
        # event set was created (or never ran); there is nothing to free in
        # that case and a null handle must not be passed to NVML.
        if self._event_set != 0:
            nvml.event_set_free(self._event_set)

    def wait(self, timeout_ms: int = 0) -> EventData:
        """
        Wait for events in the event set.

        For Fermi™ or newer fully supported devices.

        If some events are ready to be delivered at the time of the call,
        function returns immediately. If there are no events ready to be
        delivered, function sleeps until event arrives but not longer than
        specified timeout. If timeout passes, a
        :class:`cuda.core.system.TimeoutError` is raised. This function in
        certain conditions can return before specified timeout passes (e.g. when
        interrupt arrives).

        On Windows, in case of Xid error, the function returns the most recent
        Xid error type seen by the system. If there are multiple Xid errors
        generated before ``wait`` is invoked, then the last seen Xid
        error type is returned for all Xid error events.

        On Linux, every Xid error event would return the associated event data
        and other information if applicable.

        In MIG mode, if device handle is provided, the API reports all the
        events for the available instances, only if the caller has appropriate
        privileges. In absence of required privileges, only the events which
        affect all the instances (i.e. whole device) are reported.

        This API does not currently support per-instance event reporting using
        MIG device handles.

        Parameters
        ----------
        timeout_ms : int
            The timeout in milliseconds. A value of 0 means to wait indefinitely.

        Returns
        -------
        EventData
            The event that was delivered.

        Raises
        ------
        :class:`cuda.core.system.TimeoutError`
            If the timeout expires before an event is received.
        :class:`cuda.core.system.GpuIsLostError`
            If the GPU has fallen off the bus or is otherwise inaccessible.
        """
        return EventData(nvml.event_set_wait_v2(self._event_set, timeout_ms))