Coverage for cuda / core / system / _event.pxi: 35.71%

28 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-08 01:07 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5  

6EventType = nvml.EventType 

7  

8  

cdef class EventData:
    """
    Data about a single event.

    Wraps an ``nvml.EventData`` record; each property reads the matching
    field from the wrapped record.
    """
    def __init__(self, event_data: nvml.EventData):
        self._event_data = event_data

    @property
    def device(self) -> Device:
        """
        The device on which the event occurred.
        """
        # ``__new__`` must be passed the class to instantiate; calling it with
        # no arguments raises ``TypeError``. This bypasses ``Device.__init__``
        # and sets the handle directly from the event record.
        device = Device.__new__(Device)
        device._handle = self._event_data.device
        return device

    @property
    def event_type(self) -> EventType:
        """
        The type of event that was triggered.
        """
        return EventType(self._event_data.event_type)

    @property
    def event_data(self) -> int:
        """
        Returns Xid error for the device in the event of
        :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR:
            raise ValueError("event_data is only available for Xid critical error events.")
        return self._event_data.event_data

    @property
    def gpu_instance_id(self) -> int:
        """
        The GPU instance ID for MIG devices.

        Only valid for events of type :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR:
            raise ValueError("gpu_instance_id is only available for Xid critical error events.")
        return self._event_data.gpu_instance_id

    @property
    def compute_instance_id(self) -> int:
        """
        The Compute instance ID for MIG devices.

        Only valid for events of type :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR:
            raise ValueError("compute_instance_id is only available for Xid critical error events.")
        return self._event_data.compute_instance_id

69  

70  

cdef class DeviceEvents:
    """
    Represents a set of events that can be waited on for a specific device.
    """
    cdef intptr_t _event_set
    cdef intptr_t _device_handle

    def __init__(self, device_handle: intptr_t, events: EventType | int | list[EventType | int]):
        """
        Create an event set and register ``events`` on the given device.

        Parameters
        ----------
        device_handle: intptr_t
            Handle of the device to register the events on.
        events: EventType | int | list[EventType | int]
            A single event type, or a list of event types that are combined
            with bitwise OR into one mask.

        Raises
        ------
        TypeError
            If ``events`` is not an ``EventType``, ``int``, or ``list``.
        """
        cdef unsigned long long event_bitmask
        if isinstance(events, (int, EventType)):
            event_bitmask = <unsigned long long>int(events)
        elif isinstance(events, list):
            event_bitmask = 0
            for ev in events:
                event_bitmask |= <unsigned long long>int(ev)
        else:
            raise TypeError("events must be an EventType, int, or list of EventType or int")

        self._device_handle = device_handle
        self._event_set = nvml.event_set_create()
        # If this raises, the event needs to be freed and this is handled by
        # this class's __dealloc__ method.
        nvml.device_register_events(self._device_handle, event_bitmask, self._event_set)

    def __dealloc__(self):
        # ``_event_set`` is still zero when ``__init__`` failed before the
        # event set was created (e.g. the ``TypeError`` above); there is
        # nothing to free in that case.
        if self._event_set != 0:
            nvml.event_set_free(self._event_set)
            self._event_set = 0

    def wait(self, timeout_ms: int = 0) -> EventData:
        """
        Wait for events in the event set.

        For Fermi™ or newer fully supported devices.

        If some events are ready to be delivered at the time of the call,
        function returns immediately. If there are no events ready to be
        delivered, function sleeps until event arrives but not longer than
        specified timeout. If timeout passes, a
        :class:`cuda.core.system.TimeoutError` is raised. This function in
        certain conditions can return before specified timeout passes (e.g. when
        interrupt arrives).

        On Windows, in case of Xid error, the function returns the most recent
        Xid error type seen by the system. If there are multiple Xid errors
        generated before ``wait`` is invoked, then the last seen Xid
        error type is returned for all Xid error events.

        On Linux, every Xid error event would return the associated event data
        and other information if applicable.

        In MIG mode, if device handle is provided, the API reports all the
        events for the available instances, only if the caller has appropriate
        privileges. In absence of required privileges, only the events which
        affect all the instances (i.e. whole device) are reported.

        This API does not currently support per-instance event reporting using
        MIG device handles.

        Parameters
        ----------
        timeout_ms: int
            The timeout in milliseconds. A value of 0 means to wait indefinitely.

        Returns
        -------
        EventData
            The event delivered by the event set.

        Raises
        ------
        :class:`cuda.core.system.TimeoutError`
            If the timeout expires before an event is received.
        :class:`cuda.core.system.GpuIsLostError`
            If the GPU has fallen off the bus or is otherwise inaccessible.
        """
        return EventData(nvml.event_set_wait_v2(self._event_set, timeout_ms))