Coverage for cuda / core / system / _device.pyx: 75.29%

255 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-22 01:37 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from libc.stdint cimport intptr_t, uint64_t 

6from libc.math cimport ceil 

7  

8from multiprocessing import cpu_count 

9from typing import Iterable 

10import warnings 

11  

12from cuda.bindings import nvml 

13  

14from ._nvml_context cimport initialize 

15from cuda.core.system.typing import ( 

16 AddressingMode, 

17 AffinityScope, 

18 DeviceArch, 

19 ClockId, 

20 ClocksEventReasons, 

21 ClockType, 

22 CoolerControl, 

23 CoolerTarget, 

24 DeviceArch, 

25 EventType, 

26 FanControlPolicy, 

27 FieldId, 

28 GpuP2PCapsIndex, 

29 GpuP2PStatus, 

30 GpuTopologyLevel, 

31 InforomObject, 

32 TemperatureThresholds, 

33 ThermalController, 

34 ThermalTarget, 

35) 

36  

37  

cdef object _pstate_to_int(object pstate):
    """
    Convert an NVML P-state enum value into a plain integer index.

    Parameters
    ----------
    pstate: nvml.Pstates
        The P-state value as reported by NVML.

    Returns
    -------
    int | None
        The offset of ``pstate`` from ``nvml.Pstates.PSTATE_0`` (0 through
        15), or ``None`` when ``pstate`` is ``PSTATE_UNKNOWN``.

    Raises
    ------
    ValueError
        If ``pstate`` is neither ``PSTATE_UNKNOWN`` nor one of the 16
        numbered P-states.
    """
    if pstate == nvml.Pstates.PSTATE_UNKNOWN:
        return None
    # Range-check the value that is actually returned (the offset from
    # PSTATE_0), and raise explicitly: an ``assert`` vanishes when assertions
    # are compiled out, which would let an out-of-range value through.
    index = int(pstate) - int(nvml.Pstates.PSTATE_0)
    if index < 0 or index > 15:
        raise ValueError(
            f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive, or PSTATE_UNKNOWN."
        )
    return index

45  

46  

cdef int _pstate_to_enum(int pstate):
    """
    Convert a plain integer P-state index into the matching NVML enum value.

    Parameters
    ----------
    pstate: int
        P-state index, 0 through 15 inclusive.

    Returns
    -------
    int
        ``pstate`` offset by the integer value of ``nvml.Pstates.PSTATE_0``.

    Raises
    ------
    ValueError
        If ``pstate`` is outside 0..15.
    """
    if not (0 <= pstate <= 15):
        raise ValueError(f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive.")
    base = int(nvml.Pstates.PSTATE_0)
    return base + int(pstate)

51  

52  

53include "_clock.pxi" 

54include "_cooler.pxi" 

55include "_device_attributes.pxi" 

56include "_device_utils.pxi" 

57include "_event.pxi" 

58include "_fan.pxi" 

59include "_field_values.pxi" 

60include "_inforom.pxi" 

61include "_memory.pxi" 

62include "_mig.pxi" 

63include "_nvlink.pxi" 

64include "_pci_info.pxi" 

65include "_performance.pxi" 

66include "_process.pxi" 

67include "_repair_status.pxi" 

68include "_temperature.pxi" 

69include "_utilization.pxi" 

70  

71  

# NVML addressing-mode enum -> public AddressingMode.
# Looked up by Device.addressing_mode, which returns None for unmapped values.
_ADDRESSING_MODE_MAPPING = {
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_HMM: AddressingMode.HMM,
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_ATS: AddressingMode.ATS,
}


# Public AffinityScope -> NVML affinity scope.
# Used by Device.get_memory_affinity and Device.get_cpu_affinity; a missing
# key is reported to the caller as a ValueError there.
_AFFINITY_SCOPE_MAPPING = {
    AffinityScope.NODE: nvml.AffinityScope.NODE,
    AffinityScope.SOCKET: nvml.AffinityScope.SOCKET,
}


# NVML brand enum -> human-readable brand string.
# Device.brand falls back to "Unknown" for values not listed here.
_BRAND_TYPE_MAPPING = {
    nvml.BrandType.BRAND_UNKNOWN: "Unknown",
    nvml.BrandType.BRAND_QUADRO: "Quadro",
    nvml.BrandType.BRAND_TESLA: "Tesla",
    nvml.BrandType.BRAND_NVS: "NVS",
    nvml.BrandType.BRAND_GRID: "GRID",
    nvml.BrandType.BRAND_GEFORCE: "GeForce",
    nvml.BrandType.BRAND_TITAN: "Titan",
    nvml.BrandType.BRAND_NVIDIA_VAPPS: "NVIDIA vApps",
    nvml.BrandType.BRAND_NVIDIA_VPC: "NVIDIA VPC",
    nvml.BrandType.BRAND_NVIDIA_VCS: "NVIDIA VCS",
    nvml.BrandType.BRAND_NVIDIA_VWS: "NVIDIA VWS",
    nvml.BrandType.BRAND_NVIDIA_CLOUD_GAMING: "NVIDIA Cloud Gaming",
    nvml.BrandType.BRAND_NVIDIA_VGAMING: "NVIDIA vGaming",
    nvml.BrandType.BRAND_QUADRO_RTX: "Quadro RTX",
    nvml.BrandType.BRAND_NVIDIA_RTX: "NVIDIA RTX",
    nvml.BrandType.BRAND_NVIDIA: "NVIDIA",
    nvml.BrandType.BRAND_GEFORCE_RTX: "GeForce RTX",
    nvml.BrandType.BRAND_TITAN_RTX: "Titan RTX",
}


# Public GpuP2PCapsIndex -> NVML P2P capability index.
# NOTE(review): the consumer is not visible in this part of the file.
_GPU_P2P_CAPS_INDEX_MAPPING = {
    GpuP2PCapsIndex.READ: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ,
    GpuP2PCapsIndex.WRITE: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE,
    GpuP2PCapsIndex.NVLINK: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_NVLINK,
    GpuP2PCapsIndex.ATOMICS: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_ATOMICS,
    GpuP2PCapsIndex.PCI: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PCI,
    GpuP2PCapsIndex.PROP: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PROP,
    GpuP2PCapsIndex.UNKNOWN: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_UNKNOWN,
}


# NVML P2P status enum -> public GpuP2PStatus.
# NOTE(review): the consumer is not visible in this part of the file.
_GPU_P2P_STATUS_MAPPING = {
    nvml.GpuP2PStatus.P2P_STATUS_OK: GpuP2PStatus.OK,
    nvml.GpuP2PStatus.P2P_STATUS_CHIPSET_NOT_SUPPORTED: GpuP2PStatus.CHIPSET_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_GPU_NOT_SUPPORTED: GpuP2PStatus.GPU_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED: GpuP2PStatus.IOH_TOPOLOGY_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_DISABLED_BY_REGKEY: GpuP2PStatus.DISABLED_BY_REGKEY,
    nvml.GpuP2PStatus.P2P_STATUS_NOT_SUPPORTED: GpuP2PStatus.NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_UNKNOWN: GpuP2PStatus.UNKNOWN,
}


# Public GpuTopologyLevel -> NVML topology level.
_GPU_TOPOLOGY_LEVEL_MAPPING = {
    GpuTopologyLevel.INTERNAL: nvml.GpuTopologyLevel.TOPOLOGY_INTERNAL,
    GpuTopologyLevel.SINGLE: nvml.GpuTopologyLevel.TOPOLOGY_SINGLE,
    GpuTopologyLevel.MULTIPLE: nvml.GpuTopologyLevel.TOPOLOGY_MULTIPLE,
    GpuTopologyLevel.HOSTBRIDGE: nvml.GpuTopologyLevel.TOPOLOGY_HOSTBRIDGE,
    GpuTopologyLevel.NODE: nvml.GpuTopologyLevel.TOPOLOGY_NODE,
    GpuTopologyLevel.SYSTEM: nvml.GpuTopologyLevel.TOPOLOGY_SYSTEM,
}


# Reverse lookup (NVML topology level -> public GpuTopologyLevel), derived
# from the table above so the two directions cannot drift apart.
_GPU_TOPOLOGY_LEVEL_INV_MAPPING = {v: k for k, v in _GPU_TOPOLOGY_LEVEL_MAPPING.items()}

139  

140  

141  

142cdef class Device: 

143 """ 

144 Representation of a device. 

145  

146 :class:`cuda.core.system.Device` provides access to various pieces of metadata 

147 about devices and their topology, as provided by the NVIDIA Management 

148 Library (NVML). To use CUDA with a device, use :class:`cuda.core.Device`. 

149  

150 Creating a device instance causes NVML to initialize the target GPU. 

151 NVML may initialize additional GPUs if the target GPU is an SLI slave. 

152  

153 Parameters 

154 ---------- 

155 index: int, optional 

156 Integer representing the NVML device index to get a handle to. Valid 

157 values are between ``0`` and ``cuda.core.system.get_num_devices() - 1``. 

158  

159 The order in which devices are enumerated has no guarantees of 

160 consistency between reboots. For that reason, it is recommended that 

161 devices are looked up by their PCI ids or UUID. 

162  

163 uuid: bytes or str, optional 

164 UUID of a CUDA device to get a handle to. 

165  

166 pci_bus_id: bytes or str, optional 

167 PCI bus ID of a CUDA device to get a handle to. 

168  

169 Raises 

170 ------ 

171 ValueError 

172 If anything other than exactly one of `index`, `uuid` or `pci_bus_id` is specified. 

173 """ 

174  

175 # This is made public for testing purposes only 

176 cdef public intptr_t _handle 

177  

def __init__(
    self,
    *,
    index: int | None = None,
    uuid: bytes | str | None = None,
    pci_bus_id: bytes | str | None = None,
):
    # Exactly one of the three keyword-only selectors must be supplied;
    # count how many the caller actually passed.
    cdef int arg_count = 0
    for selector in (index, uuid, pci_bus_id):
        if selector is not None:
            arg_count += 1

    if arg_count > 1:
        raise ValueError("Handle requires only one of `index`, `uuid`, or `pci_bus_id`.")
    if arg_count == 0:
        raise ValueError("Handle requires either a device `index`, `uuid`, or `pci_bus_id`.")

    # NVML must be initialized before any handle lookup.
    initialize()

    if index is not None:
        self._handle = nvml.device_get_handle_by_index_v2(index)
        return

    if uuid is not None:
        # Accept bytes for convenience; NVML is handed a str.
        if isinstance(uuid, bytes):
            uuid = uuid.decode("ascii")
        self._handle = nvml.device_get_handle_by_uuid(uuid)
        return

    if pci_bus_id is not None:
        if isinstance(pci_bus_id, bytes):
            pci_bus_id = pci_bus_id.decode("ascii")
        self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)

205  

206 ######################################################################### 

207 # BASIC PROPERTIES 

208  

@property
def index(self) -> int:
    """
    NVML index of this device.

    Valid indices run from 0 up to one less than the count returned by
    :meth:`Device.get_device_count`; with a count of 2 the valid indices
    are 0 and 1.

    NVML does not guarantee a stable enumeration order across reboots, so
    looking devices up by PCI id or GPU UUID is recommended.

    Note: The NVML index may not correlate with other APIs, such as the
    CUDA device index.
    """
    handle = self._handle
    return nvml.device_get_index(handle)

227  

@property
def uuid(self) -> str:
    """
    Globally unique, immutable UUID of this device, as a 5 part hexadecimal
    string that augments the immutable board serial identifier.

    As in the upstream NVML C++ API, the value carries a ``gpu-`` or
    ``mig-`` prefix. Use `uuid_without_prefix` when the prefix is not
    wanted (for example, to interact with CUDA).
    """
    handle = self._handle
    return nvml.device_get_uuid(handle)

240  

@property
def uuid_without_prefix(self) -> str:
    """
    Globally unique, immutable UUID of this device, as a 5 part hexadecimal
    string, with the NVML prefix removed.

    In the upstream NVML C++ API the UUID includes a ``gpu-`` or ``mig-``
    prefix. This property drops it so the value matches the UUIDs used in
    CUDA. Use the `uuid` property when the prefix is needed.
    """
    prefixed = nvml.device_get_uuid(self._handle)
    # The prefix (`gpu-` / `mig-`) is four characters long.
    return prefixed[4:]

254  

@property
def pci_bus_id(self) -> str:
    """
    PCI bus ID of this device, taken from :attr:`pci_info`.
    """
    info = self.pci_info
    return info.bus_id

261  

@property
def numa_node_id(self) -> int:
    """
    NUMA node of this GPU device.

    Only applies to platforms where the GPUs are NUMA nodes.
    """
    handle = self._handle
    return nvml.device_get_numa_node_id(handle)

270  

@property
def arch(self) -> DeviceArch:
    """
    :obj:`~DeviceArch` architecture of the device.

    For example, a Tesla V100 reports an architecture whose ``name`` is
    ``"VOLTA"``, and an RTX A6000 one whose ``name`` is ``"AMPERE"``.

    An architecture value that :obj:`~DeviceArch` does not recognise is
    reported as ``DeviceArch.UNKNOWN``.
    """
    raw_arch = nvml.device_get_architecture(self._handle)
    try:
        resolved = DeviceArch(raw_arch)
    except ValueError:
        resolved = DeviceArch.UNKNOWN
    return resolved

285  

@property
def name(self) -> str:
    """
    Product name of the device, e.g.: `"Tesla V100-SXM2-32GB"`
    """
    handle = self._handle
    return nvml.device_get_name(handle)

292  

@property
def brand(self) -> str:
    """
    Brand of the device, as a human-readable string.

    Returns "Unknown" when NVML reports a brand that has no known name.
    """
    brand_id = nvml.device_get_brand(self._handle)
    try:
        return _BRAND_TYPE_MAPPING[brand_id]
    except KeyError:
        return "Unknown"

301  

@property
def serial(self) -> str:
    """
    Globally unique board serial number associated with this device's
    board.

    For all products with an InfoROM.
    """
    handle = self._handle
    return nvml.device_get_serial(handle)

311  

@property
def module_id(self) -> int:
    """
    Unique identifier of the device module on the baseboard.

    Each GPU module present on a given baseboard gets its own identifier.
    For non-baseboard products, this ID would always be 0.
    """
    handle = self._handle
    return nvml.device_get_module_id(handle)

322  

@property
def minor_number(self) -> int:
    """
    Minor number of this device.

    For Linux only.

    The Linux device driver uses the minor number to identify the device
    node in ``/dev/nvidiaX``.
    """
    handle = self._handle
    return nvml.device_get_minor_number(handle)

334  

@property
def is_c2c_enabled(self) -> bool:
    """
    Whether C2C (Chip-to-Chip) mode is enabled for this device.
    """
    c2c_info = nvml.device_get_c2c_mode_info_v(self._handle)
    return bool(c2c_info.is_c2c_enabled)

341  

@property
def is_persistence_mode_enabled(self) -> bool:
    """
    Whether persistence mode is enabled for this device.

    For Linux only.

    Assigning a truthy or falsy value to this property enables or disables
    persistence mode respectively.
    """
    mode = nvml.device_get_persistence_mode(self._handle)
    return mode == nvml.EnableState.FEATURE_ENABLED

@is_persistence_mode_enabled.setter
def is_persistence_mode_enabled(self, enabled: bool) -> None:
    # Translate the Python truth value into the NVML enable state.
    if enabled:
        state = nvml.EnableState.FEATURE_ENABLED
    else:
        state = nvml.EnableState.FEATURE_DISABLED
    nvml.device_set_persistence_mode(self._handle, state)

357  

@property
def cuda_compute_capability(self) -> tuple[int, int]:
    """
    CUDA compute capability of the device, e.g.: `(7, 0)` for a Tesla V100.

    The value is a `(major, minor)` tuple.
    """
    handle = self._handle
    return nvml.device_get_cuda_compute_capability(handle)

366  

def to_cuda_device(self) -> "cuda.core.Device":
    """
    Get the corresponding :class:`cuda.core.Device` (which is used for CUDA
    access) for this :class:`cuda.core.system.Device` (which is used for
    NVIDIA Management Library (NVML) access).

    The devices are mapped to one another by their UUID.

    Returns
    -------
    cuda.core.Device
        The corresponding CUDA device.

    Raises
    ------
    RuntimeError
        No corresponding CUDA device is found for this NVML device.

        For example, on a MIG system, the physical GPU will not have an
        available CUDA device, since it can not be used directly, even
        though it can be enumerated from NVML.
    """
    # Imported lazily, at call time rather than at module import.
    from cuda.core import Device as CudaDevice

    # CUDA does not have an API to get a device by its UUID, so we just
    # search all the devices for one with a matching UUID.
    #
    # Reading `uuid_without_prefix` queries NVML, and the value does not
    # change during the search, so fetch it once instead of per iteration.
    target_uuid = self.uuid_without_prefix

    for cuda_device in CudaDevice.get_all_devices():
        if cuda_device.uuid == target_uuid:
            return cuda_device

    raise RuntimeError("No corresponding CUDA device found for this NVML device.")

399  

@classmethod
def get_device_count(cls) -> int:
    """
    Report how many devices are available.

    Returns
    -------
    int
        The number of available devices.
    """
    # NVML has to be initialized before it can be queried.
    initialize()
    count = nvml.device_get_count_v2()
    return count

413  

@classmethod
def get_all_devices(cls) -> Iterable[Device]:
    """
    Iterate over every available device.

    Returns
    -------
    Iterator over :obj:`~Device`
        An iterator over available devices, in NVML index order.
    """
    initialize()

    # The device count is read once, when iteration starts.
    num_devices = nvml.device_get_count_v2()
    for device_id in range(num_devices):
        yield cls(index=device_id)

428  

429 ######################################################################### 

430 # ADDRESSING MODE 

431  

@property
def addressing_mode(self) -> AddressingMode | None:
    """
    The :obj:`~AddressingMode` of the device, or ``None`` when NVML reports
    a mode that has no :obj:`~AddressingMode` equivalent.
    """
    mode_info = nvml.device_get_addressing_mode(self._handle)
    return _ADDRESSING_MODE_MAPPING.get(mode_info.value)

438  

439 ######################################################################### 

440 # MIG (MULTI-INSTANCE GPU) DEVICES 

441  

@property
def mig(self) -> MigInfo:
    """
    :obj:`~MigInfo` accessor for MIG (Multi-Instance GPU) information about
    this device.

    For Ampere™ or newer fully supported devices.
    """
    accessor = MigInfo(self)
    return accessor

450  

451 ######################################################################### 

452 # AFFINITY 

453  

@classmethod
def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]:
    """
    Retrieve the set of GPUs that have a CPU affinity with the given CPU number.

    Supported on Linux only.

    Parameters
    ----------
    cpu_index: int
        The CPU index.

    Returns
    -------
    Iterator of :obj:`~Device`
        An iterator over available devices.
    """
    cdef Device device

    # Like the other classmethods that query NVML without an existing
    # Device, make sure NVML is initialized first; otherwise this call
    # fails when it is the first NVML access in the process.
    initialize()

    for handle in nvml.system_get_topology_gpu_set(cpu_index):
        # NVML already hands back device handles, so bypass __init__ (which
        # would look the handle up again) and wrap each handle directly.
        device = Device.__new__(Device)
        device._handle = handle
        yield device

476  

def get_memory_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal
    memory affinity for the device.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    If the requested scope is not applicable to the target topology, the
    API will fall back to reporting the memory affinity for the immediate
    non-I/O ancestor of the device.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        The scope of the affinity query. Must be one of the values of
        :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        A list of indices of NUMA nodes or CPU sockets with the ideal memory
        affinity for the device.

    Raises
    ------
    ValueError
        If ``scope`` is not a recognised affinity scope.
    """
    try:
        nvml_scope = _AFFINITY_SCOPE_MAPPING[scope]
    except KeyError:
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {list(AffinityScope.__members__.values())}"
        ) from None

    # Size of the mask to request: one 64-bit word per 64 CPUs, rounded up.
    mask_words = <unsigned int>ceil(cpu_count() / 64)
    bitmask = nvml.device_get_memory_affinity(self._handle, mask_words, nvml_scope)
    return _unpack_bitmask(bitmask)

516  

def get_cpu_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    Retrieves the ideal CPU affinity for the device within the given node
    or socket scope.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    If the requested scope is not applicable to the target topology, the
    API will fall back to reporting the CPU affinity for the immediate
    non-I/O ancestor of the device.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        The scope of the affinity query. Must be one of the values of
        :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        The indices unpacked from the CPU-affinity bitmask that NVML
        reports for the device within the requested scope.

    Raises
    ------
    ValueError
        If ``scope`` is not a recognised affinity scope.
    """
    try:
        nvml_scope = _AFFINITY_SCOPE_MAPPING[scope]
    except KeyError:
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {list(AffinityScope.__members__.values())}"
        ) from None

    # Size of the mask to request: one 64-bit word per 64 CPUs, rounded up.
    mask_words = <unsigned int>ceil(cpu_count() / 64)
    bitmask = nvml.device_get_cpu_affinity_within_scope(self._handle, mask_words, nvml_scope)
    return _unpack_bitmask(bitmask)

556  

def set_cpu_affinity(self):
    """
    Bind the calling thread to the ideal CPU affinity for this device.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    handle = self._handle
    nvml.device_set_cpu_affinity(handle)

566  

def clear_cpu_affinity(self):
    """
    Remove all affinity bindings for the calling thread.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    handle = self._handle
    nvml.device_clear_cpu_affinity(handle)

576  

577 ######################################################################### 

578 # CLOCK 

579 # See external class definitions in _clock.pxi 

580  

def get_clock(self, clock_type: ClockType | str) -> ClockInfo:
    """
    Build a :obj:`~_device.ClockInfo` object for reading information about,
    and managing, one specific clock of this device.
    """
    clock = ClockInfo(self._handle, clock_type)
    return clock

586  

@property
def is_auto_boosted_clocks_enabled(self) -> tuple[bool, bool]:
    """
    Current and default state of auto boosted clocks on the device.

    For Kepler™ or newer fully supported devices.

    Auto Boosted clocks are enabled by default on some hardware, allowing
    the GPU to run at higher clock rates to maximize performance as thermal
    limits allow.

    On Pascal™ and newer hardware, Auto Boosted clocks are controlled
    through application clocks. Use :meth:`set_application_clocks` and
    :meth:`reset_application_clocks` to control Auto Boost behavior.

    Returns
    -------
    bool
        The current state of Auto Boosted clocks
    bool
        The default Auto Boosted clocks behavior

    """
    current, default = nvml.device_get_auto_boosted_clocks_enabled(self._handle)
    is_current_on = current == nvml.EnableState.FEATURE_ENABLED
    is_default_on = default == nvml.EnableState.FEATURE_ENABLED
    return is_current_on, is_default_on

612  

@property
def current_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    The :obj:`~ClocksEventReasons` currently active on the device.

    For all fully supported products.

    Raises
    ------
    ValueError
        If NVML reports a reason bit that has no known
        :obj:`~ClocksEventReasons` value.
    """
    # The bitmask helper is handed a one-element array holding the single
    # 64-bit mask that NVML returns.
    cdef uint64_t[1] reasons
    reasons[0] = nvml.device_get_current_clocks_event_reasons(self._handle)

    decoded = []
    for reason in _unpack_bitmask(reasons):
        # `reason` is a bit position; the mapping is keyed by the bit value.
        try:
            decoded.append(_CLOCKS_EVENT_REASONS_MAPPING[1 << reason])
        except KeyError:
            raise ValueError(f"Unknown clock event reason bit: {1 << reason}")
    return decoded

630  

@property
def supported_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    The :obj:`~ClocksEventReasons` that this device can report through
    :attr:`current_clock_event_reasons`.

    For all fully supported products.

    This is not supported in virtual machines running virtual GPU (vGPU).

    Raises
    ------
    ValueError
        If NVML reports a reason bit that has no known
        :obj:`~ClocksEventReasons` value.
    """
    # The bitmask helper is handed a one-element array holding the single
    # 64-bit mask that NVML returns.
    cdef uint64_t[1] reasons
    reasons[0] = nvml.device_get_supported_clocks_event_reasons(self._handle)

    decoded = []
    for reason in _unpack_bitmask(reasons):
        # `reason` is a bit position; the mapping is keyed by the bit value.
        try:
            decoded.append(_CLOCKS_EVENT_REASONS_MAPPING[1 << reason])
        except KeyError:
            raise ValueError(f"Unknown clock event reason bit: {1 << reason}")
    return decoded

651  

652 ########################################################################## 

653 # COOLER 

654 # See external class definitions in _cooler.pxi 

655  

@property
def cooler(self) -> CoolerInfo:
    """
    :obj:`~_device.CoolerInfo` object describing the cooler of the device.
    """
    raw_info = nvml.device_get_cooler_info(self._handle)
    return CoolerInfo(raw_info)

662  

663 ########################################################################## 

664 # DEVICE ATTRIBUTES 

665 # See external class definitions in _device_attributes.pxi 

666  

@property
def attributes(self) -> DeviceAttributes:
    """
    :obj:`~_device.DeviceAttributes` object holding various attributes of
    the device.

    For Ampere™ or newer fully supported devices. Only available on Linux
    systems.
    """
    raw_attributes = nvml.device_get_attributes_v2(self._handle)
    return DeviceAttributes(raw_attributes)

676  

677 ######################################################################### 

678 # DISPLAY 

679  

@property
def is_display_connected(self) -> bool:
    """
    The display mode for this device.

    Tells whether a physical display (e.g. monitor) is currently connected
    to any of the device's connectors.
    """
    display_mode = nvml.device_get_display_mode(self._handle)
    return display_mode == nvml.EnableState.FEATURE_ENABLED

689  

@property
def is_display_active(self) -> bool:
    """
    The display active status for this device.

    Tells whether a display is initialized on the device, for example
    whether an X Server is attached to this device and has allocated memory
    for the screen.

    A display can be active even when no monitor is physically attached.
    """
    active_state = nvml.device_get_display_active(self._handle)
    return active_state == nvml.EnableState.FEATURE_ENABLED

702  

703 ########################################################################## 

704 # EVENTS 

705 # See external class definitions in _event.pxi 

706  

def register_events(self, events: EventType | str | list[EventType | str]) -> DeviceEvents:
    """
    Begin recording events on this device.

    For Fermi™ or newer fully supported devices. For Linux only.

    ECC events are available only on ECC-enabled devices (see
    :meth:`Device.get_total_ecc_errors`). Power capping events are
    available only on Power Management enabled devices (see
    :meth:`Device.get_power_management_mode`).

    Recording starts with this call; events that occurred before it are
    not recorded. Use the :meth:`DeviceEvents.wait` method of the result to
    wait for events.

    Examples
    --------
    >>> device = Device(index=0)
    >>> events = device.register_events([
    ...     EventType.XID_CRITICAL_ERROR,
    ... ])
    >>> while event := events.wait(timeout_ms=10000):
    ...     print(f"Event {event.event_type} occurred on device {event.device.uuid}")

    Parameters
    ----------
    events: EventType, str, or list of EventType or str
        The event type or list of event types to register for this device.

    Returns
    -------
    :obj:`~_device.DeviceEvents`
        An object representing the registered events. Call
        :meth:`~_device.DeviceEvents.wait` on this object to wait for events.

    Raises
    ------
    :class:`cuda.core.system.NotSupportedError`
        None of the requested event types are registered.
    """
    registration = DeviceEvents(self._handle, events)
    return registration

748  

def get_supported_event_types(self) -> list[EventType]:
    """
    List the event types that this device supports.

    For Fermi™ or newer fully supported devices. For Linux only (returns an
    empty list on Windows).

    Returns
    -------
    list[EventType]
        The list of supported event types.

    Raises
    ------
    ValueError
        If NVML reports an event bit that has no known :class:`EventType`.
    """
    # The bitmask helper is handed a one-element array holding the single
    # 64-bit mask that NVML returns.
    cdef uint64_t[1] bitmask
    bitmask[0] = nvml.device_get_supported_event_types(self._handle)

    supported = []
    for ev in _unpack_bitmask(bitmask):
        # `ev` is a bit position; the mapping is keyed by the bit value.
        try:
            supported.append(_EVENT_TYPE_MAPPING[1 << ev])
        except KeyError:
            raise ValueError(f"Unknown event type bit: {1 << ev}")
    return supported

771  

772 ########################################################################## 

773 # FAN 

774 # See external class definitions in _fan.pxi 

775  

def get_fan(self, fan: int = 0) -> FanInfo:
    """
    :obj:`~_device.FanInfo` object to get information and manage a specific fan on a device.

    Parameters
    ----------
    fan: int, optional
        Index of the fan, from 0 up to (but excluding) :attr:`num_fans`.
        Default is 0.

    Raises
    ------
    ValueError
        If ``fan`` is outside the range of fans present on the device.
    """
    # Query the fan count once: each read of `num_fans` is an NVML call, and
    # using a single value keeps the check and the message consistent.
    num_fans = self.num_fans
    if fan < 0 or fan >= num_fans:
        raise ValueError(f"Fan index {fan} is out of range [0, {num_fans})")
    return FanInfo(self._handle, fan)

783  

@property
def num_fans(self) -> int:
    """
    How many fans the device has.
    """
    handle = self._handle
    return nvml.device_get_num_fans(handle)

790  

791 ########################################################################## 

792 # FIELD VALUES 

793 # See external class definitions in _field_values.pxi 

794  

def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues:
    """
    Query several field values from the device in one call.

    Each requested value can carry its own exception. That exception is
    raised when the corresponding ``value`` is accessed on the returned
    :obj:`~_device.FieldValues` container.

    To confirm that the whole container is free of exceptions, call
    :meth:`~_device.FieldValues.validate`.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        List of field IDs to query.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).

    Returns
    -------
    :obj:`~_device.FieldValues`
        Container of field values corresponding to the requested field IDs.
    """
    num_requested = len(field_ids)

    # NVML raises an InvalidArgumentError for a zero-length request, so an
    # empty container is returned without calling it.
    if num_requested == 0:
        empty = nvml.FieldValue(0)
        return FieldValues(empty)

    values = nvml.device_get_field_values(self._handle, field_ids)
    return FieldValues(values)

825  

def clear_field_values(self, field_ids: list[int | tuple[int, int]]) -> None:
    """
    Clear several field values on the device in one call.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        List of field IDs to clear.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).
    """
    # NVML raises an InvalidArgumentError for a zero-length request, so only
    # call it when there is something to clear.
    if len(field_ids) != 0:
        nvml.device_clear_field_values(self._handle, field_ids)

844  

845 ########################################################################## 

846 # INFOROM 

847 # See external class definitions in _inforom.pxi 

848  

@property
def inforom(self) -> InforomInfo:
    """
    :obj:`~_device.InforomInfo` object giving access to InfoROM
    information.

    For all products with an InfoROM.
    """
    accessor = InforomInfo(self)
    return accessor

857  

858 ########################################################################## 

859 # MEMORY 

860 # See external class definitions in _memory.pxi 

861  

@property
def bar1_memory_info(self) -> BAR1MemoryInfo:
    """
    :obj:`~_device.BAR1MemoryInfo` object describing BAR1 memory.

    BAR1 is used to map the FB (device memory) so that it can be directly
    accessed by the CPU or by 3rd party devices (peer-to-peer on the PCIE
    bus).
    """
    raw_info = nvml.device_get_bar1_memory_info(self._handle)
    return BAR1MemoryInfo(raw_info)

872  

@property
def memory_info(self) -> MemoryInfo:
    """
    :obj:`~_device.MemoryInfo` object describing the device memory.
    """
    raw_info = nvml.device_get_memory_info_v2(self._handle)
    return MemoryInfo(raw_info)

879  

880 ########################################################################## 

881 # NVLINK 

882 # See external class definitions in _nvlink.pxi 

883  

def get_nvlink(self, link: int) -> NvlinkInfo:
    """
    Get the :obj:`~NvlinkInfo` for one NVLink of this device.

    For devices with NVLink support.

    Parameters
    ----------
    link: int
        Link index, from 0 up to (but excluding) ``NvlinkInfo.max_links``.

    Raises
    ------
    ValueError
        If ``link`` is outside that range.
    """
    out_of_range = link < 0 or link >= NvlinkInfo.max_links
    if not out_of_range:
        return NvlinkInfo(self, link)
    raise ValueError(f"Link index {link} is out of range [0, {NvlinkInfo.max_links})")

893  

894 ########################################################################## 

895 # PCI INFO 

896 # See external class definitions in _pci_info.pxi 

897  

@property
def pci_info(self) -> PciInfo:
    """
    :obj:`~_device.PciInfo` object holding the PCI attributes of this
    device.

    Non-physical devices, such as MIG devices, may not have PCI attributes.
    In that case, this property raises a `RuntimeError`.
    """
    try:
        raw_info = nvml.device_get_pci_info_ext(self._handle)
    except nvml.InvalidArgumentError:
        # Replace NVML's invalid-argument error with a clearer message.
        raise RuntimeError("This device does not have PCI attributes") from None
    return PciInfo(raw_info, self._handle)

912  

913 ########################################################################## 

914 # PERFORMANCE 

915 # See external class definitions in _performance.pxi 

916  

@property
def performance_state(self) -> int | None:
    """
    Current performance state (P-state) of the device.

    For Fermi™ or newer fully supported devices.

    Returns
    -------
    int | None
        The P-state as an integer between 0 and 15, where 0 is maximum
        performance and larger values mean lower performance. `None` is
        returned when NVML reports the state as unknown.
    """
    raw_pstate = nvml.device_get_performance_state(self._handle)
    # Convert the NVML enum value to a plain 0-15 index (or None).
    return _pstate_to_int(raw_pstate)

932  

@property
def dynamic_pstates_info(self) -> GpuDynamicPstatesInfo:
    """
    Performance monitor samples from the associated subdevice, as a
    :obj:`~_device.GpuDynamicPstatesInfo` object.

    Returns
    -------
    GpuDynamicPstatesInfo
        Wrapper around the result of
        ``nvml.device_get_dynamic_pstates_info`` for this device's handle.
    """
    raw_info = nvml.device_get_dynamic_pstates_info(self._handle)
    return GpuDynamicPstatesInfo(raw_info)

939  

@property
def supported_pstates(self) -> list[int]:
    """
    All Performance States (P-States) supported by the device.

    NVML returns a contiguous list of valid P-States; entries it marks as
    unknown are dropped from the result.

    Returns
    -------
    list[int]
        The supported performance states, each an integer between 0 and
        15, where 0 is maximum performance and higher numbers are lower
        performance.
    """
    # From nvml.h:
    # The returned array would contain a contiguous list of valid P-States
    # supported by the device. If the number of supported P-States is fewer
    # than the size of the array supplied missing elements would contain \a
    # NVML_PSTATE_UNKNOWN.

    raw_pstates = nvml.device_get_supported_performance_states(self._handle)
    converted = [_pstate_to_int(raw) for raw in raw_pstates]
    # _pstate_to_int yields None for the unknown-state padding entries.
    return [value for value in converted if value is not None]

967  

968 ########################################################################## 

969 # PROCESS 

970 # See external class definitions in _process.pxi 

971  

@property
def compute_running_processes(self) -> list[ProcessInfo]:
    """
    Information about processes that hold a compute context on this device.

    For Fermi™ or newer fully supported devices.

    Only compute processes are reported (e.g. CUDA applications with an
    active context). Graphics applications (e.g. using OpenGL, DirectX)
    are not listed.

    The information is dynamic, so the number of elements may change from
    one call to the next.

    In MIG mode, if a device handle is provided, the API returns aggregate
    information, only if the caller has appropriate privileges.
    Per-instance information can be queried by using specific MIG device
    handles. Querying per-instance information using MIG device handles is
    not supported if the device is in vGPU Host virtualization mode.

    Returns
    -------
    list[ProcessInfo]
        One :obj:`ProcessInfo` per entry returned by
        ``nvml.device_get_compute_running_processes_v3``.
    """
    running = nvml.device_get_compute_running_processes_v3(self._handle)
    return [ProcessInfo(self, entry) for entry in running]

994  

995 ########################################################################## 

996 # REPAIR STATUS 

997 # See external class definitions in _repair_status.pxi 

998  

@property
def repair_status(self) -> RepairStatus:
    """
    TPC/Channel repair status of this device, as a
    :obj:`~_device.RepairStatus` object.

    For Ampere™ or newer fully supported devices.

    Returns
    -------
    RepairStatus
        Object constructed from this device's handle.
    """
    handle = self._handle
    return RepairStatus(handle)

1007  

1008 ########################################################################## 

1009 # TEMPERATURE 

1010 # See external class definitions in _temperature.pxi 

1011  

@property
def temperature(self) -> Temperature:
    """
    Temperature information for the device, as a
    :obj:`~_device.Temperature` object.

    Returns
    -------
    Temperature
        Object constructed from this device's handle.
    """
    handle = self._handle
    return Temperature(handle)

1018  

1019 ####################################################################### 

1020 # TOPOLOGY 

1021  

def get_topology_nearest_gpus(self, level: GpuTopologyLevel | str) -> Iterable[Device]:
    """
    Yield the GPUs nearest to this device at a given interconnectivity level.

    Supported on Linux only.

    This method is a generator: no work is done, and ``level`` is not
    validated, until iteration begins.

    Parameters
    ----------
    level: :class:`GpuTopologyLevel` | str
        The topology level. It must be a key of the module's topology
        level mapping.

    Returns
    -------
    Iterable of :class:`Device`
        The nearest devices at the given topology level.

    Raises
    ------
    ValueError
        If ``level`` is not a recognised topology level.
    """
    cdef Device neighbor
    try:
        nvml_level = _GPU_TOPOLOGY_LEVEL_MAPPING[level]
    except KeyError:
        raise ValueError(
            f"Invalid topology level: {level}. "
            f"Must be one of {list(GpuTopologyLevel.__members__.values())}"
        ) from None
    for neighbor_handle in nvml.device_get_topology_nearest_gpus(self._handle, nvml_level):
        # Bypass __init__ and attach the NVML handle directly.
        neighbor = Device.__new__(Device)
        neighbor._handle = neighbor_handle
        yield neighbor

1050  

1051 ####################################################################### 

1052 # UTILIZATION 

1053  

@property
def utilization(self) -> Utilization:
    """
    Current :obj:`~Utilization` rates for the device's major subsystems.

    For Fermi™ or newer fully supported devices.

    Note: During driver initialization when ECC is enabled one can see high
    GPU and Memory Utilization readings. This is caused by the ECC Memory
    Scrubbing mechanism that is performed during driver initialization.

    Note: On MIG-enabled GPUs, querying device utilization rates is not
    currently supported.

    Returns
    -------
    Utilization
        An object containing the current utilization rates for the device.
    """
    rates = nvml.device_get_utilization_rates(self._handle)
    return Utilization(rates)

1075  

1076  

def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel:
    """
    Retrieve the common ancestor level for two devices.

    For Linux only.

    Parameters
    ----------
    device1: :class:`Device`
        The first device.
    device2: :class:`Device`
        The second device.

    Returns
    -------
    :class:`GpuTopologyLevel`
        The common ancestor level of the two devices.
    """
    raw_level = nvml.device_get_topology_common_ancestor(
        device1._handle,
        device2._handle,
    )
    # Translate the NVML result back into the public topology level.
    return _GPU_TOPOLOGY_LEVEL_INV_MAPPING[raw_level]

1101  

1102  

def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex | str) -> GpuP2PStatus:
    """
    Retrieve the P2P status between two devices.

    Parameters
    ----------
    device1: :class:`Device`
        The first device.
    device2: :class:`Device`
        The second device.
    index: :class:`GpuP2PCapsIndex` | str
        The P2P capability index to query between ``device1`` and ``device2``.

    Returns
    -------
    :class:`GpuP2PStatus`
        The P2P status between the two devices. ``GpuP2PStatus.UNKNOWN`` is
        returned when the value reported by NVML has no known mapping.

    Raises
    ------
    ValueError
        If ``index`` is not a recognised P2P capability index.
    """
    try:
        caps_index = _GPU_P2P_CAPS_INDEX_MAPPING[index]
    except KeyError:
        raise ValueError(
            f"Invalid P2P caps index: {index}. "
            f"Must be one of {list(GpuP2PCapsIndex.__members__.values())}"
        ) from None
    raw_status = nvml.device_get_p2p_status(
        device1._handle,
        device2._handle,
        caps_index,
    )
    return _GPU_P2P_STATUS_MAPPING.get(raw_status, GpuP2PStatus.UNKNOWN)

1136  

1137  

# Public names of this module, i.e. what ``from <module> import *`` exports.
__all__ = [
    "Device",
    "get_p2p_status",
    "get_topology_common_ancestor",
    "NvlinkInfo",
]