Coverage for cuda / core / system / _event.pxi: 28.85%

52 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-22 01:37 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5  

# Forward table: each ``nvml.EventType`` member is paired with the
# ``EventType`` member that has the same name.  The tuple below fixes both
# the set of supported members and the insertion order of the table.
_EVENT_TYPE_MAPPING = {
    getattr(nvml.EventType, name): getattr(EventType, name)
    for name in (
        "NONE",
        "SINGLE_BIT_ECC_ERROR",
        "DOUBLE_BIT_ECC_ERROR",
        "PSTATE",
        "XID_CRITICAL_ERROR",
        "CLOCK",
        "POWER_SOURCE_CHANGE",
        "MIG_CONFIG_CHANGE",
        "SINGLE_BIT_ECC_ERROR_STORM",
        "DRAM_RETIREMENT_EVENT",
        "DRAM_RETIREMENT_FAILURE",
        "NON_FATAL_POISON_ERROR",
        "FATAL_POISON_ERROR",
        "GPU_UNAVAILABLE_ERROR",
        "GPU_RECOVERY_ACTION",
    )
}


# Reverse table: ``EventType`` member -> ``nvml.EventType`` member.
_EVENT_TYPE_INV_MAPPING = dict(
    (public_type, nvml_type)
    for nvml_type, public_type in _EVENT_TYPE_MAPPING.items()
)

26  

27  

cdef class EventData:
    """
    Data about a single event.

    Wraps the ``nvml.EventData`` record passed to the constructor and exposes
    its fields as read-only properties.
    """
    def __init__(self, event_data: nvml.EventData):
        self._event_data = event_data

    @property
    def device(self) -> Device:
        """
        The device on which the event occurred.
        """
        # Allocate a Device without running its __init__ and attach the
        # handle carried by the event record.
        dev = Device.__new__(Device)
        dev._handle = self._event_data.device
        return dev

    @property
    def event_type(self) -> EventType:
        """
        The type of event that was triggered.
        """
        raw_type = self._event_data.event_type
        return _EVENT_TYPE_MAPPING[raw_type]

    @property
    def event_data(self) -> int:
        """
        The Xid error for the device.

        Only valid for events of type
        :attr:`~cuda.core.system.EventType.XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self._event_data.event_type == nvml.EventType.XID_CRITICAL_ERROR:
            return self._event_data.event_data
        raise ValueError("event_data is only available for Xid critical error events.")

    @property
    def gpu_instance_id(self) -> int:
        """
        The GPU instance ID for MIG devices.

        Only valid for events of type
        :attr:`~cuda.core.system.EventType.XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self._event_data.event_type == nvml.EventType.XID_CRITICAL_ERROR:
            return self._event_data.gpu_instance_id
        raise ValueError("gpu_instance_id is only available for Xid critical error events.")

    @property
    def compute_instance_id(self) -> int:
        """
        The Compute instance ID for MIG devices.

        Only valid for events of type
        :attr:`~cuda.core.system.EventType.XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self._event_data.event_type == nvml.EventType.XID_CRITICAL_ERROR:
            return self._event_data.compute_instance_id
        raise ValueError("compute_instance_id is only available for Xid critical error events.")

88  

89  

cdef class DeviceEvents:
    """
    Represents a set of events that can be waited on for a specific device.

    Constructing an instance creates an NVML event set and registers the
    requested event types for the device on it.  The event set is freed when
    the instance is deallocated.
    """
    cdef intptr_t _event_set
    cdef intptr_t _device_handle

    def __init__(self, device_handle: intptr_t, events: EventType | str | list[EventType | str]):
        """
        Create an event set and register ``events`` for ``device_handle``.

        Parameters
        ----------
        device_handle: intptr_t
            Handle of the device whose events are registered.
        events: EventType | str | list[EventType | str]
            A single event type, or a list of event types, to wait on.

        Raises
        ------
        :class:`TypeError`
            If ``events`` is not an ``EventType``, a ``str`` or a ``list``.
        :class:`ValueError`
            If an entry of ``events`` is not a known event type.
        """
        cdef unsigned long long event_bitmask

        # Cython zero-initialises C attributes when the object is allocated,
        # so a non-zero handle here means __init__ is running again on a live
        # instance.  Free the previous event set rather than overwriting (and
        # leaking) its handle.
        if self._event_set != 0:
            nvml.event_set_free(self._event_set)
            self._event_set = 0

        # Normalise a single event type to a one-element list.
        if isinstance(events, (str, EventType)):
            events = [events]

        if not isinstance(events, list):
            raise TypeError("events must be an EventType, str, or list of EventType or str")

        # OR the NVML value of every requested event type into one bitmask.
        event_bitmask = 0
        for ev in events:
            try:
                ev_enum = _EVENT_TYPE_INV_MAPPING[ev]
            except KeyError:
                raise ValueError(
                    f"Invalid event type: {ev}. "
                    f"Must be one of {list(EventType.__members__.values())}"
                ) from None
            event_bitmask |= <unsigned long long>int(ev_enum)

        self._device_handle = device_handle
        self._event_set = nvml.event_set_create()
        # If this raises, the event needs to be freed and this is handled by
        # this class's __dealloc__ method.
        nvml.device_register_events(self._device_handle, event_bitmask, self._event_set)

    def __dealloc__(self):
        # A zero handle means no event set was created, so there is nothing
        # to free.
        if self._event_set != 0:
            nvml.event_set_free(self._event_set)

    def wait(self, timeout_ms: int = 0) -> EventData:
        """
        Wait for events in the event set.

        For Fermi™ or newer fully supported devices.

        If some events are ready to be delivered at the time of the call,
        function returns immediately. If there are no events ready to be
        delivered, function sleeps until event arrives but not longer than
        specified timeout. If timeout passes, a
        :class:`cuda.core.system.TimeoutError` is raised. This function in
        certain conditions can return before specified timeout passes (e.g. when
        interrupt arrives).

        On Windows, in case of Xid error, the function returns the most recent
        Xid error type seen by the system. If there are multiple Xid errors
        generated before ``wait`` is invoked, then the last seen Xid
        error type is returned for all Xid error events.

        On Linux, every Xid error event would return the associated event data
        and other information if applicable.

        In MIG mode, if device handle is provided, the API reports all the
        events for the available instances, only if the caller has appropriate
        privileges. In absence of required privileges, only the events which
        affect all the instances (i.e. whole device) are reported.

        This API does not currently support per-instance event reporting using
        MIG device handles.

        Parameters
        ----------
        timeout_ms: int
            The timeout in milliseconds. A value of 0 means to wait indefinitely.

        Returns
        -------
        EventData
            The event that was delivered.

        Raises
        ------
        :class:`cuda.core.system.TimeoutError`
            If the timeout expires before an event is received.
        :class:`cuda.core.system.GpuIsLostError`
            If the GPU has fallen off the bus or is otherwise inaccessible.
        """
        return EventData(nvml.event_set_wait_v2(self._event_set, timeout_ms))