Coverage for cuda / core / system / _system_events.pyx: 64.29%

28 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-29 01:27 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5  

6from libc.stdint cimport intptr_t 

7  

8from cuda.bindings import nvml 

9  

10from ._nvml_context cimport initialize 

11  

12from . import _device 

13  

14  

# Alias of the NVML system-event enum so it can be referenced from this module.
SystemEventType = nvml.SystemEventType

16  

17  

cdef class SystemEvent:
    """
    Data about a single system event.

    Wraps an ``nvml.SystemEventData_v1`` object that holds exactly one
    event record.
    """
    # NOTE(review): ``_event_data`` is not declared in this block; it is
    # presumably declared in a matching ``.pxd`` file -- confirm.

    def __init__(self, event_data: nvml.SystemEventData_v1):
        """
        Parameters
        ----------
        event_data: nvml.SystemEventData_v1
            Event data containing exactly one record.

        Raises
        ------
        ValueError
            If ``event_data`` does not contain exactly one record.
        """
        # Raise explicitly rather than ``assert``: assertions are skipped
        # when Python runs with ``-O`` or when they are compiled out.
        if len(event_data) != 1:
            raise ValueError(
                f"SystemEvent requires exactly one event record, got {len(event_data)}"
            )
        self._event_data = event_data

    @property
    def event_type(self) -> SystemEventType:
        """
        The :obj:`~SystemEventType` that was triggered.
        """
        return SystemEventType(self._event_data.event_type)

    @property
    def gpu_id(self) -> int:
        """
        The GPU ID in PCI ID format.
        """
        return self._event_data.gpu_id

    @property
    def device(self) -> _device.Device:
        """
        The :obj:`~_device.Device` associated with this event.
        """
        # NOTE(review): ``gpu_id`` is forwarded as ``pci_bus_id``; confirm
        # that ``Device`` accepts the value in this form.
        return _device.Device(pci_bus_id=self.gpu_id)

46  

47  

cdef class SystemEvents:
    """
    Data about a collection of system events.

    Behaves like a read-only sequence of :obj:`~_system_events.SystemEvent`
    objects: it supports ``len()``, integer indexing and iteration.
    """
    # NOTE(review): ``_event_data`` is not declared in this block; it is
    # presumably declared in a matching ``.pxd`` file -- confirm.

    def __init__(self, event_data: nvml.SystemEventData_v1):
        """
        Parameters
        ----------
        event_data: nvml.SystemEventData_v1
            The event records to expose.
        """
        self._event_data = event_data

    def __len__(self):
        """
        Return the number of events in the collection.
        """
        return len(self._event_data)

    def __getitem__(self, idx: int) -> SystemEvent:
        """
        Get the :obj:`~_system_events.SystemEvent` at the specified index.
        """
        return SystemEvent(self._event_data[idx])

    def __iter__(self):
        """
        Iterate over the events in index order.

        Defined explicitly so that iteration is bounded by ``len()`` and
        does not depend on the wrapped data raising ``IndexError`` for an
        out-of-range index.
        """
        for idx in range(len(self._event_data)):
            yield SystemEvent(self._event_data[idx])

63  

64  

cdef class RegisteredSystemEvents:
    """
    Represents a set of registered system events that can be waited on.

    The underlying NVML event set is created in ``__init__`` and released
    in ``__dealloc__``.
    """
    # Handle of the NVML system event set. Cython zero-initialises C
    # attributes, so 0 means "no event set has been created".
    cdef intptr_t _event_set

    def __init__(self, events: SystemEventType | int | list[SystemEventType | int]):
        """
        Create an event set and register the requested event types on it.

        Parameters
        ----------
        events: SystemEventType, int, or list of SystemEventType or int
            The event type(s) to register. A list is combined into a single
            bitmask with bitwise OR.

        Raises
        ------
        TypeError
            If ``events`` is not a SystemEventType, an int, or a list.
        """
        cdef unsigned long long event_bitmask
        if isinstance(events, (int, SystemEventType)):
            event_bitmask = <unsigned long long>int(events)
        elif isinstance(events, list):
            event_bitmask = 0
            for ev in events:
                event_bitmask |= <unsigned long long>int(ev)
        else:
            raise TypeError("events must be an SystemEventType, int, or list of SystemEventType or int")

        initialize()

        self._event_set = nvml.system_event_set_create()
        # If this raises, the event set needs to be freed and this is handled
        # by this class's __dealloc__ method.
        nvml.system_register_events(event_bitmask, self._event_set)

    def __dealloc__(self):
        # __init__ can fail before the event set exists (invalid ``events``
        # or a failure in initialize()); the handle is then still 0 and
        # there is nothing to free.
        if self._event_set != 0:
            nvml.system_event_set_free(self._event_set)
            self._event_set = 0

    def wait(self, timeout_ms: int = 0, buffer_size: int = 1) -> SystemEvents:
        """
        Wait for events in the system event set.

        For Fermi™ or newer fully supported devices.

        If some events are ready to be delivered at the time of the call,
        function returns immediately. If there are no events ready to be
        delivered, function sleeps till event arrives but not longer than
        specified timeout. If timeout passes, a
        :class:`cuda.core.system.TimeoutError` is raised. This function in
        certain conditions can return before specified timeout passes (e.g. when
        interrupt arrives).

        Parameters
        ----------
        timeout_ms: int
            The timeout in milliseconds. A value of 0 means to wait indefinitely.
        buffer_size: int
            The maximum number of events to retrieve. Must be at least 1.

        Returns
        -------
        :obj:`~_system_events.SystemEvents`
            A set of events that were received. The number of events returned may
            be less than the specified buffer size if fewer events were available.

        Raises
        ------
        :class:`cuda.core.system.TimeoutError`
            If the timeout expires before an event is received.
        :class:`cuda.core.system.GpuIsLostError`
            If the GPU has fallen off the bus or is otherwise inaccessible.
        """
        return SystemEvents(nvml.system_event_set_wait(self._event_set, timeout_ms, buffer_size))

127  

128  

def register_events(events: SystemEventType | int | list[SystemEventType | int]) -> RegisteredSystemEvents:
    """
    Starts recording of events on the system.

    For Linux only.

    All events that occurred before this call are not recorded. Wait for events
    using the :meth:`RegisteredSystemEvents.wait` method on the result.

    Parameters
    ----------
    events: SystemEventType, int, or list of SystemEventType or int
        The event type or list of event types to register.

    Returns
    -------
    :obj:`~_system_events.RegisteredSystemEvents`
        An object representing the registered events. Call
        :meth:`~_system_events.RegisteredSystemEvents.wait` on this object to wait for events.

    Raises
    ------
    :class:`cuda.core.system.NotSupportedError`
        None of the requested event types are registered.

    Examples
    --------
    >>> from cuda.core import system
    >>> registered = system.register_events([
    ...     system.SystemEventType.SYSTEM_EVENT_TYPE_GPU_DRIVER_UNBIND,
    ... ])
    >>> while events := registered.wait(timeout_ms=10000):
    ...     for idx in range(len(events)):
    ...         print(f"Event {events[idx].event_type} occurred.")
    """
    # wait() returns a SystemEvents collection, so the example above indexes
    # into it to reach each SystemEvent before reading ``event_type``.
    return RegisteredSystemEvents(events)

164  

165  

# Names exported by ``from ._system_events import *``.
__all__ = [
    "register_events",
    "SystemEventType",
]

169]