Coverage for cuda / core / system / _event.pxi: 28.85%
52 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-22 01:37 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-22 01:37 +0000
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# Translation table from ``nvml.EventType`` members to the ``EventType``
# members of the same name. Both enums expose every name listed here, so the
# table is built by looking each name up on both sides.
_EVENT_TYPE_MAPPING = {
    getattr(nvml.EventType, name): getattr(EventType, name)
    for name in (
        "NONE",
        "SINGLE_BIT_ECC_ERROR",
        "DOUBLE_BIT_ECC_ERROR",
        "PSTATE",
        "XID_CRITICAL_ERROR",
        "CLOCK",
        "POWER_SOURCE_CHANGE",
        "MIG_CONFIG_CHANGE",
        "SINGLE_BIT_ECC_ERROR_STORM",
        "DRAM_RETIREMENT_EVENT",
        "DRAM_RETIREMENT_FAILURE",
        "NON_FATAL_POISON_ERROR",
        "FATAL_POISON_ERROR",
        "GPU_UNAVAILABLE_ERROR",
        "GPU_RECOVERY_ACTION",
    )
}


# Reverse table: ``EventType`` member -> ``nvml.EventType`` member.
_EVENT_TYPE_INV_MAPPING = dict(
    zip(_EVENT_TYPE_MAPPING.values(), _EVENT_TYPE_MAPPING.keys())
)
cdef class EventData:
    """
    Read-only view over the data of one event.

    Wraps an ``nvml.EventData`` record and exposes its fields as properties.
    """
    def __init__(self, event_data: nvml.EventData):
        self._event_data = event_data

    @property
    def device(self) -> Device:
        """
        The device on which the event occurred.

        A new :class:`Device` object is created on every access and given the
        device handle stored in the underlying event record.
        """
        dev = Device.__new__(Device)
        dev._handle = self._event_data.device
        return dev

    @property
    def event_type(self) -> EventType:
        """
        The type of event that was triggered.
        """
        raw_type = self._event_data.event_type
        return _EVENT_TYPE_MAPPING[raw_type]

    @property
    def event_data(self) -> int:
        """
        The Xid error for the device, available for events of type
        :attr:`~cuda.core.system.EventType.XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self._event_data.event_type == nvml.EventType.XID_CRITICAL_ERROR:
            return self._event_data.event_data
        raise ValueError("event_data is only available for Xid critical error events.")

    @property
    def gpu_instance_id(self) -> int:
        """
        The GPU instance ID for MIG devices.

        Only valid for events of type :attr:`EventType.XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self._event_data.event_type == nvml.EventType.XID_CRITICAL_ERROR:
            return self._event_data.gpu_instance_id
        raise ValueError("gpu_instance_id is only available for Xid critical error events.")

    @property
    def compute_instance_id(self) -> int:
        """
        The Compute instance ID for MIG devices.

        Only valid for events of type :attr:`EventType.XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self._event_data.event_type == nvml.EventType.XID_CRITICAL_ERROR:
            return self._event_data.compute_instance_id
        raise ValueError("compute_instance_id is only available for Xid critical error events.")
cdef class DeviceEvents:
    """
    A set of events, registered for one specific device, that can be waited on.
    """
    cdef:
        intptr_t _event_set
        intptr_t _device_handle

    def __init__(self, device_handle: intptr_t, events: EventType | str | list[EventType | str]):
        """
        Create an event set and register the requested events on the device.

        Parameters
        ----------
        device_handle: intptr_t
            Handle of the device whose events are registered.
        events: EventType | str | list[EventType | str]
            A single event type, or a list of event types, to register.

        Raises
        ------
        :class:`TypeError`
            If ``events`` is not an ``EventType``, a ``str``, or a list.
        :class:`ValueError`
            If an entry of ``events`` is not a known event type.
        """
        cdef unsigned long long event_bitmask = 0

        # Zero marks "no event set allocated"; __dealloc__ checks for it.
        self._event_set = 0

        if isinstance(events, (str, EventType)):
            # A lone event is handled as a one-element list.
            events = [events]
        elif not isinstance(events, list):
            raise TypeError("events must be an EventType, str, or list of EventType or str")

        # OR the NVML value of every requested event into a single bitmask.
        for ev in events:
            nvml_event = _EVENT_TYPE_INV_MAPPING.get(ev)
            if nvml_event is None:
                raise ValueError(
                    f"Invalid event type: {ev}. "
                    f"Must be one of {list(EventType.__members__.values())}"
                ) from None
            event_bitmask |= <unsigned long long>int(nvml_event)

        self._device_handle = device_handle
        self._event_set = nvml.event_set_create()
        # Should the registration below raise, the event set created above
        # still has to be freed; __dealloc__ takes care of that.
        nvml.device_register_events(self._device_handle, event_bitmask, self._event_set)

    def __dealloc__(self):
        # Nothing to release if no event set was ever created.
        if self._event_set == 0:
            return
        nvml.event_set_free(self._event_set)

    def wait(self, timeout_ms: int = 0) -> EventData:
        """
        Wait for events in the event set.

        For Fermi™ or newer fully supported devices.

        When events are already pending at the time of the call, the function
        returns immediately. Otherwise it sleeps until an event arrives, but
        no longer than the given timeout. If the timeout passes, a
        :class:`cuda.core.system.TimeoutError` is raised. Under certain
        conditions the function can return before the timeout has passed
        (e.g. when an interrupt arrives).

        On Windows, in case of Xid error, the function returns the most recent
        Xid error type seen by the system. If multiple Xid errors are
        generated before ``wait`` is invoked, the last seen Xid error type is
        returned for all Xid error events.

        On Linux, every Xid error event returns the associated event data and
        other information if applicable.

        In MIG mode, if a device handle is provided, the API reports all the
        events for the available instances, but only if the caller has
        appropriate privileges. Without the required privileges, only the
        events which affect all the instances (i.e. the whole device) are
        reported.

        This API does not currently support per-instance event reporting using
        MIG device handles.

        Parameters
        ----------
        timeout_ms: int
            The timeout in milliseconds. A value of 0 means to wait indefinitely.

        Raises
        ------
        :class:`cuda.core.system.TimeoutError`
            If the timeout expires before an event is received.
        :class:`cuda.core.system.GpuIsLostError`
            If the GPU has fallen off the bus or is otherwise inaccessible.
        """
        raw_event = nvml.event_set_wait_v2(self._event_set, timeout_ms)
        return EventData(raw_event)