Coverage for cuda / core / system / _system_events.pyx: 62.50%

40 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-22 01:37 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5  

6from libc.stdint cimport intptr_t 

7  

8from cuda.bindings import nvml 

9  

10from ._nvml_context cimport initialize 

11  

12from . import _device 

13from cuda.core.system.typing import SystemEventType 

14  

15  

# Translation table from the raw NVML system-event enumerators to the public
# ``SystemEventType`` values exposed by this package.
_SYSTEM_EVENT_TYPE_MAPPING = dict(
    (
        (nvml.SystemEventType.GPU_DRIVER_UNBIND, SystemEventType.UNBIND),
        (nvml.SystemEventType.GPU_DRIVER_BIND, SystemEventType.BIND),
    )
)


# Reverse table (public value -> NVML enumerator), consulted when turning
# user-supplied event types into an NVML registration bitmask.
_SYSTEM_EVENT_TYPE_INV_MAPPING = {
    public: native for native, public in _SYSTEM_EVENT_TYPE_MAPPING.items()
}

23  

24  

cdef class SystemEvent:
    """
    Data about a single system event.
    """
    # Extension types have no per-instance ``__dict__``, so the attribute
    # assigned in ``__init__`` has to be declared for the assignment to work.
    cdef object _event_data

    def __init__(self, event_data: nvml.SystemEventData_v1):
        # The wrapped record must describe exactly one event; collections of
        # events are represented by ``SystemEvents``.
        assert len(event_data) == 1
        self._event_data = event_data

    @property
    def event_type(self) -> SystemEventType:
        """
        The :obj:`~SystemEventType` that was triggered.
        """
        # Translate the NVML enumerator into the public enum.
        return _SYSTEM_EVENT_TYPE_MAPPING[self._event_data.event_type]

    @property
    def gpu_id(self) -> int:
        """
        The GPU ID in PCI ID format.
        """
        return self._event_data.gpu_id

    @property
    def device(self) -> _device.Device:
        """
        The :obj:`~_device.Device` associated with this event.
        """
        # NOTE(review): ``gpu_id`` is forwarded unchanged as ``pci_bus_id``;
        # confirm that ``Device`` accepts this representation.
        return _device.Device(pci_bus_id=self.gpu_id)

53  

54  

cdef class SystemEvents:
    """
    Data about a collection of system events.
    """
    # Extension types have no per-instance ``__dict__``, so the attribute
    # assigned in ``__init__`` has to be declared for the assignment to work.
    cdef object _event_data

    def __init__(self, event_data: nvml.SystemEventData_v1):
        self._event_data = event_data

    def __len__(self):
        """
        Return the number of events held by this collection.
        """
        return len(self._event_data)

    def __getitem__(self, idx: int) -> SystemEvent:
        """
        Get the :obj:`~_system_events.SystemEvent` at the specified index.
        """
        # Indexing the underlying data is expected to yield a one-element
        # record, which ``SystemEvent`` asserts on construction.
        return SystemEvent(self._event_data[idx])

70  

71  

cdef class RegisteredSystemEvents:
    """
    Represents a set of registered system events that can be waited on.
    """
    # NVML system event set handle stored as an integer; 0 means that no set
    # is currently owned by this object.
    cdef intptr_t _event_set

    def __init__(self, events: SystemEventType | str | list[SystemEventType | str]):
        """
        Create an NVML system event set and register ``events`` on it.

        Parameters
        ----------
        events: SystemEventType, str, or list of SystemEventType or str
            The event type or list of event types to register.

        Raises
        ------
        ValueError
            If an entry of ``events`` is not a known event type.
        TypeError
            If ``events`` is not a SystemEventType, str, or list.
        """
        cdef unsigned long long event_bitmask
        cdef intptr_t previous_set

        # Normalize a single event type to a one-element list.
        if isinstance(events, (str, SystemEventType)):
            events = [events]

        if isinstance(events, list):
            event_bitmask = 0
            for ev in events:
                try:
                    ev_enum = _SYSTEM_EVENT_TYPE_INV_MAPPING[ev]
                except KeyError:
                    raise ValueError(
                        f"Invalid event type: {ev}. "
                        f"Must be one of {list(SystemEventType.__members__.values())}"
                    ) from None
                # OR the NVML enumerator values together into one bitmask.
                event_bitmask |= <unsigned long long>int(ev_enum)
        else:
            raise TypeError("events must be an SystemEventType, str, or list of SystemEventType or str")

        initialize()

        # If ``__init__`` is invoked again on a live object, release the set
        # created by the earlier call instead of overwriting (and leaking) it.
        # The handle is cleared first so a failing free is not retried by
        # ``__dealloc__``.
        previous_set = self._event_set
        self._event_set = 0
        if previous_set != 0:
            nvml.system_event_set_free(previous_set)

        self._event_set = nvml.system_event_set_create()
        # If this raises, the event needs to be freed and this is handled by
        # this class's __dealloc__ method.
        nvml.system_register_events(event_bitmask, self._event_set)

    def __dealloc__(self):
        # Only free a set that was actually created.
        if self._event_set != 0:
            nvml.system_event_set_free(self._event_set)

    def wait(self, timeout_ms: int = 0, buffer_size: int = 1) -> SystemEvents:
        """
        Wait for events in the system event set.

        For Fermi™ or newer fully supported devices.

        If some events are ready to be delivered at the time of the call,
        function returns immediately. If there are no events ready to be
        delivered, function sleeps till event arrives but not longer than
        specified timeout. If timeout passes, a
        :class:`cuda.core.system.TimeoutError` is raised. This function in
        certain conditions can return before specified timeout passes (e.g. when
        interrupt arrives)

        Parameters
        ----------
        timeout_ms: int
            The timeout in milliseconds. A value of 0 means to wait indefinitely.
        buffer_size: int
            The maximum number of events to retrieve. Must be at least 1.

        Returns
        -------
        :obj:`~_system_events.SystemEvents`
            A set of events that were received. The number of events returned may
            be less than the specified buffer size if fewer events were available.

        Raises
        ------
        ValueError
            If ``buffer_size`` is less than 1.
        :class:`cuda.core.system.TimeoutError`
            If the timeout expires before an event is received.
        :class:`cuda.core.system.GpuIsLostError`
            If the GPU has fallen off the bus or is otherwise inaccessible.
        """
        # Enforce the documented lower bound before calling into NVML.
        if buffer_size < 1:
            raise ValueError(f"buffer_size must be at least 1, got {buffer_size}")
        return SystemEvents(nvml.system_event_set_wait(self._event_set, timeout_ms, buffer_size))

144  

145  

def register_events(events: SystemEventType | str | list[SystemEventType | str]) -> RegisteredSystemEvents:
    """
    Starts recording of events on the system.

    For Linux only.

    All events that occurred before this call are not recorded. Wait for events
    using the :meth:`RegisteredSystemEvents.wait` method on the result.

    Examples
    --------
    >>> from cuda.core import system
    >>> from cuda.core.system.typing import SystemEventType
    >>> registered = system.register_events([SystemEventType.UNBIND])
    >>> received = registered.wait(timeout_ms=10000)
    >>> for i in range(len(received)):
    ...     print(f"Event {received[i].event_type} occurred.")

    Parameters
    ----------
    events: SystemEventType, str, or list of SystemEventType or str
        The event type or list of event types to register.

    Returns
    -------
    :obj:`~_system_events.RegisteredSystemEvents`
        An object representing the registered events. Call
        :meth:`~_system_events.RegisteredSystemEvents.wait` on this object to wait for events.

    Raises
    ------
    ValueError
        If an entry of ``events`` is not a known event type.
    TypeError
        If ``events`` is not a SystemEventType, str, or list.
    :class:`cuda.core.system.NotSupportedError`
        None of the requested event types are registered.
    """
    return RegisteredSystemEvents(events)

179  

180  

# Public API of this module; the event classes are reached through the
# objects returned by ``register_events``.
__all__ = ["register_events"]