Coverage for cuda / core / system / _temperature.pxi: 14.63%

82 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-22 01:37 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5  

# Translates each public TemperatureThresholds member into the matching
# nvml.TemperatureThresholds constant. The nvml names are the public names
# with a "TEMPERATURE_THRESHOLD_" prefix, so the table is derived from the
# list of public member names.
_TEMPERATURE_THRESHOLD_MAPPING = {
    getattr(TemperatureThresholds, member_name): getattr(
        nvml.TemperatureThresholds, f"TEMPERATURE_THRESHOLD_{member_name}"
    )
    for member_name in (
        "SHUTDOWN",
        "SLOWDOWN",
        "MEM_MAX",
        "GPU_MAX",
        "ACOUSTIC_MIN",
        "ACOUSTIC_CURR",
        "ACOUSTIC_MAX",
        "GPS_CURR",
    )
}

16  

17  

# Translates nvml.ThermalController members into the public ThermalController
# members. Both enums use the same member names, so the table is derived from
# that shared list of names.
_THERMAL_CONTROLLER_MAPPING = {
    getattr(nvml.ThermalController, member_name): getattr(ThermalController, member_name)
    for member_name in (
        "GPU_INTERNAL",
        "ADM1032",
        "ADT7461",
        "MAX6649",
        "MAX1617",
        "LM99",
        "LM89",
        "LM64",
        "G781",
        "ADT7473",
        "SBMAX6649",
        "VBIOSEVT",
        "OS",
        "NVSYSCON_CANOAS",
        "NVSYSCON_E551",
        "MAX6649R",
        "ADT7473S",
        "UNKNOWN",
    )
}

38  

39  

# Translates nvml.ThermalTarget members into the public ThermalTarget members.
# Both enums use the same member names, so the table is derived from that
# shared list of names.
_THERMAL_TARGET_MAPPING = {
    getattr(nvml.ThermalTarget, member_name): getattr(ThermalTarget, member_name)
    for member_name in (
        "NONE",
        "GPU",
        "MEMORY",
        "POWER_SUPPLY",
        "BOARD",
        "VCD_BOARD",
        "VCD_INLET",
        "VCD_OUTLET",
        "ALL",
    )
}

51  

52  

# Reverse of _THERMAL_TARGET_MAPPING: public ThermalTarget member ->
# nvml.ThermalTarget member, used when passing a target back to nvml.
_THERMAL_TARGET_INV_MAPPING = {
    public_target: nvml_target
    for nvml_target, public_target in _THERMAL_TARGET_MAPPING.items()
}

54  

55  

# Layout of one thermal sensor record.
#
# In cuda.bindings.nvml, this is an anonymous struct inside
# nvmlThermalSettings_t, so it has no name that could be referenced from
# there. It is redeclared here so that ThermalSensor can cast a raw address
# to a typed pointer. The fields are presumably required to stay in the same
# order and with the same types as the NVML definition -- confirm against the
# NVML header before changing anything here.


ctypedef struct _ThermalSensor:
    # Raw controller identifier; translated via _THERMAL_CONTROLLER_MAPPING.
    int controller
    int defaultMinTemp
    int defaultMaxTemp
    int currentTemp
    # Raw target identifier; translated via _THERMAL_TARGET_MAPPING.
    int target

65  

66  

cdef class ThermalSensor:
    """
    Read-only view of a single thermal sensor record.

    Nothing is copied: each property reads through a raw pointer into memory
    owned by another object, and that object is kept referenced for as long
    as this view exists.
    """
    cdef:
        _ThermalSensor *_ptr
        object _owner

    def __init__(self, ptr: int, owner: object):
        """
        Parameters
        ----------
        ptr: int
            Address of the sensor record to expose.
        owner: object
            Object holding the numpy buffer that ``ptr`` points into.
        """
        # `ptr` addresses a region inside the numpy buffer held by `owner`.
        # Storing `owner` keeps that buffer alive while this view can still
        # dereference the pointer.
        self._owner = owner
        self._ptr = <_ThermalSensor *><intptr_t>ptr

    @property
    def controller(self) -> ThermalController:
        """
        The thermal controller of this sensor.

        Raw values without an entry in the controller table are reported as
        ``ThermalController.UNKNOWN``.
        """
        raw_controller = self._ptr[0].controller
        return _THERMAL_CONTROLLER_MAPPING.get(raw_controller, ThermalController.UNKNOWN)

    @property
    def default_min_temp(self) -> int:
        """The ``defaultMinTemp`` field of the sensor record."""
        minimum = self._ptr[0].defaultMinTemp
        return minimum

    @property
    def default_max_temp(self) -> int:
        """The ``defaultMaxTemp`` field of the sensor record."""
        maximum = self._ptr[0].defaultMaxTemp
        return maximum

    @property
    def current_temp(self) -> int:
        """The ``currentTemp`` field of the sensor record."""
        current = self._ptr[0].currentTemp
        return current

    @property
    def target(self) -> ThermalTarget:
        """
        The thermal target of this sensor.

        Raw values without an entry in the target table are reported as
        ``ThermalTarget.NONE``.
        """
        raw_target = self._ptr[0].target
        return _THERMAL_TARGET_MAPPING.get(raw_target, ThermalTarget.NONE)

97  

98  

cdef class ThermalSettings:
    """
    Sequence-like wrapper around an ``nvml.ThermalSettings`` object.

    ``len()`` gives the number of sensors and indexing yields a
    :class:`ThermalSensor` view onto the corresponding record.
    """
    cdef object _thermal_settings

    def __init__(self, thermal_settings: nvml.ThermalSettings):
        """
        Parameters
        ----------
        thermal_settings: nvml.ThermalSettings
            The settings object to wrap. It is kept referenced because the
            sensor views point into its memory.
        """
        self._thermal_settings = thermal_settings

    def __len__(self):
        """Return the number of sensors, capped at 3."""
        # MAX_THERMAL_SENSORS_PER_GPU is 3. The reported count is clamped to
        # it, presumably so that indexing can never step past the sensor
        # array.
        return min(self._thermal_settings.count, 3)

    def __getitem__(self, idx: int) -> ThermalSensor:
        """
        Return a view of the sensor at position ``idx``.

        Parameters
        ----------
        idx: int
            Zero-based sensor position. Negative indices are not supported.

        Returns
        -------
        ThermalSensor
            A view onto the sensor record; it keeps the wrapped settings
            object alive.

        Raises
        ------
        IndexError
            If ``idx`` is negative or not less than ``len(self)``.
        """
        if idx < 0 or idx >= len(self):
            raise IndexError("Thermal sensor index out of range")
        # The sensor records are addressed as consecutive _ThermalSensor
        # structs starting at `sensor.ptr`, so the record address is the base
        # plus a whole number of struct sizes.
        return ThermalSensor(
            self._thermal_settings.sensor.ptr + idx * sizeof(_ThermalSensor),
            self._thermal_settings
        )

116  

117  

cdef class Temperature:
    """
    Temperature-related queries for a single device.

    Every method forwards to ``nvml`` using the device handle given at
    construction time.
    """
    cdef intptr_t _handle

    def __init__(self, handle: int):
        """
        Parameters
        ----------
        handle: int
            The device handle passed to every ``nvml`` call.
        """
        self._handle = handle

    def get_sensor(self) -> int:
        """
        Get the temperature reading from a specific sensor on the device, in
        degrees Celsius.

        The only sensor currently supported is the GPU temperature sensor.

        Returns
        -------
        int
            The temperature in degrees Celsius.
        """
        # NOTE: nvml.device_get_temperature_v takes a sensor type from the
        # TemperatureSensors enum, but that enum has only one value. This is a
        # method rather than a property so that a sensor argument can be
        # added later if the enum grows.
        gpu_sensor = nvml.TemperatureSensors.TEMPERATURE_GPU
        return nvml.device_get_temperature_v(self._handle, gpu_sensor)

    def get_threshold(self, threshold_type: TemperatureThresholds | str) -> int:
        """
        Retrieves the temperature threshold for this GPU with the specified
        threshold type, in degrees Celsius.

        For Kepler™ or newer fully supported devices.

        See :class:`TemperatureThresholds` for possible threshold types.

        Note: This API is no longer the preferred interface for retrieving the
        following temperature thresholds on Ada and later architectures:
        ``NVML_TEMPERATURE_THRESHOLD_SHUTDOWN``,
        ``NVML_TEMPERATURE_THRESHOLD_SLOWDOWN``,
        ``NVML_TEMPERATURE_THRESHOLD_MEM_MAX`` and
        ``NVML_TEMPERATURE_THRESHOLD_GPU_MAX``.

        Support for reading these temperature thresholds on Ada and later
        architectures will be removed from this API in a future release.
        Please use :meth:`get_field_values` with
        ``NVML_FI_DEV_TEMPERATURE_*`` fields to retrieve temperature
        thresholds on these architectures.

        Parameters
        ----------
        threshold_type: TemperatureThresholds
            The kind of threshold to retrieve.

        Returns
        -------
        int
            The threshold in degrees Celsius.

        Raises
        ------
        ValueError
            If ``threshold_type`` is not a known threshold type.

        Warns
        -----
        DeprecationWarning
            If one of the four thresholds listed above is requested on an Ada
            or later device.
        """
        try:
            nvml_threshold = _TEMPERATURE_THRESHOLD_MAPPING[threshold_type]
        except KeyError:
            raise ValueError(
                f"Invalid temperature threshold type: {threshold_type}. "
                f"Must be one of {list(TemperatureThresholds.__members__.values())}"
            ) from None

        # Thresholds with a preferred replacement API on newer architectures.
        discouraged_on_ada = (
            nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_SHUTDOWN,
            nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_SLOWDOWN,
            nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_MEM_MAX,
            nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_GPU_MAX
        )
        # The architecture is only queried when the requested threshold is
        # one of the discouraged ones; `and` short-circuits otherwise.
        if (
            nvml_threshold in discouraged_on_ada
            and nvml.DeviceArch(nvml.device_get_architecture(self._handle)) >= nvml.DeviceArch.ADA
        ):
            warnings.warn(
                f"{threshold_type} is no longer recommended for Ada and later architectures. "
                "Use get_field_values with NVML_FI_DEV_TEMPERATURE_* fields to retrieve this "
                "threshold on these architectures.",
                DeprecationWarning,
                stacklevel=2
            )

        # The query is still performed after warning.
        return nvml.device_get_temperature_threshold(self._handle, nvml_threshold)

    @property
    def margin(self) -> int:
        """
        The thermal margin temperature (distance to nearest slowdown threshold) for the device.
        """
        margin_temperature = nvml.device_get_margin_temperature(self._handle)
        return margin_temperature

    def get_thermal_settings(self, sensor_index: ThermalTarget | str) -> ThermalSettings:
        """
        Retrieve the thermal settings for the given thermal target.

        Parameters
        ----------
        sensor_index: ThermalTarget
            The thermal target to query.

        Returns
        -------
        :obj:`~_device.ThermalSettings`
            The thermal settings for the specified sensor.

        Raises
        ------
        ValueError
            If ``sensor_index`` is not a known thermal target.
        """
        # TODO: The NVML header describes this call as "Used to execute a list
        # of thermal system instructions", which doesn't seem to make sense,
        # so the summary above describes what this wrapper does instead.
        try:
            nvml_target = _THERMAL_TARGET_INV_MAPPING[sensor_index]
        except KeyError:
            raise ValueError(
                f"Invalid thermal sensor index: {sensor_index}. "
                f"Must be one of {list(ThermalTarget.__members__.values())}"
            ) from None

        raw_settings = nvml.device_get_thermal_settings(self._handle, nvml_target)
        return ThermalSettings(raw_settings)

215 ) from None 

216  

217 return ThermalSettings(nvml.device_get_thermal_settings(self._handle, sensor_index_enum))