Coverage for cuda/core/system/_device.pyx: 74.14%

263 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-03 01:38 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from libc.stdint cimport intptr_t, uint64_t 

6from libc.math cimport ceil 

7  

8from multiprocessing import cpu_count 

9from typing import Iterable, TYPE_CHECKING 

10import warnings 

11  

12from cuda.bindings import nvml 

13  

14from ._nvml_context cimport initialize 

15from cuda.core.system.typing import ( 

16 AddressingMode, 

17 AffinityScope, 

18 DeviceArch, 

19 ClockId, 

20 ClocksEventReasons, 

21 ClockType, 

22 CoolerControl, 

23 CoolerTarget, 

24 DeviceArch, 

25 EventType, 

26 FanControlPolicy, 

27 FieldId, 

28 GpuP2PCapsIndex, 

29 GpuP2PStatus, 

30 GpuTopologyLevel, 

31 InforomObject, 

32 TemperatureThresholds, 

33 ThermalController, 

34 ThermalTarget, 

35) 

36from cuda.core._vendored.deprecated.sphinx import deprecated, versionadded, versionchanged 

37  

38if TYPE_CHECKING: 

39 import cuda.core # no-cython-lint 

40  

41  

cdef object _pstate_to_int(object pstate):
    # Convert an NVML P-state enum value into a plain integer level.
    #
    # Returns None for PSTATE_UNKNOWN; otherwise the offset of `pstate`
    # from PSTATE_0.
    if pstate == nvml.Pstates.PSTATE_UNKNOWN:
        return None

    raw_value = int(pstate)
    # Internal sanity check on the value handed back by NVML.
    assert (
        0 <= raw_value <= 15
    ), f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive, or PSTATE_UNKNOWN."
    return raw_value - int(nvml.Pstates.PSTATE_0)

49  

50  

cdef int _pstate_to_enum(int pstate):
    # Convert an integer P-state level (0..15) into the matching NVML
    # enum value by offsetting from PSTATE_0.
    #
    # Raises ValueError when `pstate` lies outside 0..15.
    if not (0 <= pstate <= 15):
        raise ValueError(f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive.")
    return pstate + int(nvml.Pstates.PSTATE_0)

55  

56  

57include "_clock.pxi" 

58include "_cooler.pxi" 

59include "_device_attributes.pxi" 

60include "_device_utils.pxi" 

61include "_event.pxi" 

62include "_fan.pxi" 

63include "_field_values.pxi" 

64include "_inforom.pxi" 

65include "_memory.pxi" 

66include "_mig.pxi" 

67include "_nvlink.pxi" 

68include "_pci_info.pxi" 

69include "_performance.pxi" 

70include "_process.pxi" 

71include "_repair_status.pxi" 

72include "_temperature.pxi" 

73include "_utilization.pxi" 

74  

75  

# Lookup tables translating between the public `cuda.core.system.typing`
# enums and their `cuda.bindings.nvml` counterparts. Tables keyed by the
# NVML value translate results coming back from NVML; tables keyed by the
# public enum translate arguments going into NVML.

# NVML addressing mode -> public AddressingMode.
_ADDRESSING_MODE_MAPPING = {
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_HMM: AddressingMode.HMM,
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_ATS: AddressingMode.ATS,
}


# Public AffinityScope -> NVML affinity scope.
_AFFINITY_SCOPE_MAPPING = {
    AffinityScope.NODE: nvml.AffinityScope.NODE,
    AffinityScope.SOCKET: nvml.AffinityScope.SOCKET,
}


# NVML brand type -> human-readable brand name.
_BRAND_TYPE_MAPPING = {
    nvml.BrandType.BRAND_UNKNOWN: "Unknown",
    nvml.BrandType.BRAND_QUADRO: "Quadro",
    nvml.BrandType.BRAND_TESLA: "Tesla",
    nvml.BrandType.BRAND_NVS: "NVS",
    nvml.BrandType.BRAND_GRID: "GRID",
    nvml.BrandType.BRAND_GEFORCE: "GeForce",
    nvml.BrandType.BRAND_TITAN: "Titan",
    nvml.BrandType.BRAND_NVIDIA_VAPPS: "NVIDIA vApps",
    nvml.BrandType.BRAND_NVIDIA_VPC: "NVIDIA VPC",
    nvml.BrandType.BRAND_NVIDIA_VCS: "NVIDIA VCS",
    nvml.BrandType.BRAND_NVIDIA_VWS: "NVIDIA VWS",
    nvml.BrandType.BRAND_NVIDIA_CLOUD_GAMING: "NVIDIA Cloud Gaming",
    nvml.BrandType.BRAND_NVIDIA_VGAMING: "NVIDIA vGaming",
    nvml.BrandType.BRAND_QUADRO_RTX: "Quadro RTX",
    nvml.BrandType.BRAND_NVIDIA_RTX: "NVIDIA RTX",
    nvml.BrandType.BRAND_NVIDIA: "NVIDIA",
    nvml.BrandType.BRAND_GEFORCE_RTX: "GeForce RTX",
    nvml.BrandType.BRAND_TITAN_RTX: "Titan RTX",
}


# Public GpuP2PCapsIndex -> NVML P2P capability index.
_GPU_P2P_CAPS_INDEX_MAPPING = {
    GpuP2PCapsIndex.READ: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ,
    GpuP2PCapsIndex.WRITE: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE,
    GpuP2PCapsIndex.NVLINK: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_NVLINK,
    GpuP2PCapsIndex.ATOMICS: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_ATOMICS,
    GpuP2PCapsIndex.PCI: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PCI,
    GpuP2PCapsIndex.PROP: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PROP,
    GpuP2PCapsIndex.UNKNOWN: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_UNKNOWN,
}


# NVML P2P status -> public GpuP2PStatus.
_GPU_P2P_STATUS_MAPPING = {
    nvml.GpuP2PStatus.P2P_STATUS_OK: GpuP2PStatus.OK,
    nvml.GpuP2PStatus.P2P_STATUS_CHIPSET_NOT_SUPPORTED: GpuP2PStatus.CHIPSET_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_GPU_NOT_SUPPORTED: GpuP2PStatus.GPU_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED: GpuP2PStatus.IOH_TOPOLOGY_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_DISABLED_BY_REGKEY: GpuP2PStatus.DISABLED_BY_REGKEY,
    nvml.GpuP2PStatus.P2P_STATUS_NOT_SUPPORTED: GpuP2PStatus.NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_UNKNOWN: GpuP2PStatus.UNKNOWN,
}


# Public GpuTopologyLevel -> NVML topology level.
_GPU_TOPOLOGY_LEVEL_MAPPING = {
    GpuTopologyLevel.INTERNAL: nvml.GpuTopologyLevel.TOPOLOGY_INTERNAL,
    GpuTopologyLevel.SINGLE: nvml.GpuTopologyLevel.TOPOLOGY_SINGLE,
    GpuTopologyLevel.MULTIPLE: nvml.GpuTopologyLevel.TOPOLOGY_MULTIPLE,
    GpuTopologyLevel.HOSTBRIDGE: nvml.GpuTopologyLevel.TOPOLOGY_HOSTBRIDGE,
    GpuTopologyLevel.NODE: nvml.GpuTopologyLevel.TOPOLOGY_NODE,
    GpuTopologyLevel.SYSTEM: nvml.GpuTopologyLevel.TOPOLOGY_SYSTEM,
}


# Reverse of the table above: NVML topology level -> public GpuTopologyLevel.
_GPU_TOPOLOGY_LEVEL_INV_MAPPING = {v: k for k, v in _GPU_TOPOLOGY_LEVEL_MAPPING.items()}

143  

144  

145  

146cdef class Device: 

147 """ 

148 Representation of a device. 

149  

150 :class:`cuda.core.system.Device` provides access to various pieces of metadata 

151 about devices and their topology, as provided by the NVIDIA Management 

152 Library (NVML). To use CUDA with a device, use :class:`cuda.core.Device`. 

153  

154 Creating a device instance causes NVML to initialize the target GPU. 

155 NVML may initialize additional GPUs if the target GPU is an SLI slave. 

156  

157 Parameters 

158 ---------- 

159 index: int, optional 

160 Integer representing the CUDA device index to get a handle to. Valid 

161 values are between ``0`` and ``cuda.core.system.get_num_devices() - 1``. 

162  

163 The order in which devices are enumerated has no guarantees of 

164 consistency between reboots. For that reason, it is recommended that 

165 devices are looked up by their PCI ids or UUID. 

166  

167 uuid: bytes or str, optional 

168 UUID of a CUDA device to get a handle to. 

169  

170 pci_bus_id: bytes or str, optional 

171 PCI bus ID of a CUDA device to get a handle to. 

172  

173 Raises 

174 ------ 

175 ValueError 

176 If anything other than exactly one of `index`, `uuid` or `pci_bus_id` is specified. 

177 """ 

178  

179 # This is made public for testing purposes only 

180 cdef public intptr_t _handle 

181  

def __init__(
    self,
    *,
    index: int | None = None,
    uuid: bytes | str | None = None,
    pci_bus_id: bytes | str | None = None,
) -> None:
    # Exactly one of the three selectors must be given; count how many
    # the caller actually supplied.
    cdef int arg_count = sum(1 for selector in (index, uuid, pci_bus_id) if selector is not None)

    if arg_count == 0:
        raise ValueError("Handle requires either a device `index`, `uuid`, or `pci_bus_id`.")
    if arg_count > 1:
        raise ValueError("Handle requires only one of `index`, `uuid`, or `pci_bus_id`.")

    # NVML is initialized before any handle lookup is attempted.
    initialize()

    if index is not None:
        self._handle = nvml.device_get_handle_by_index_v2(index)
        return

    if uuid is not None:
        # NVML is given a `str`; bytes input is decoded as ASCII first.
        if isinstance(uuid, bytes):
            uuid = uuid.decode("ascii")
        self._handle = nvml.device_get_handle_by_uuid(uuid)
        return

    # Only `pci_bus_id` can be set at this point.
    if isinstance(pci_bus_id, bytes):
        pci_bus_id = pci_bus_id.decode("ascii")
    self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)

209  

210 ######################################################################### 

211 # BASIC PROPERTIES 

212  

213 @property 

def index(self) -> int:
    """
    NVML enumeration index of this device.

    Valid indices run from ``0`` to one less than the count returned by
    :meth:`Device.get_device_count`; e.g. a count of 2 means indices 0 and
    1, corresponding to GPU 0 and GPU 1.

    NVML gives no guarantee that enumeration order is stable across
    reboots, so looking devices up by PCI id or GPU UUID is recommended.

    Note: The NVML index may not correlate with other APIs, such as the
    CUDA device index.
    """
    nvml_index = nvml.device_get_index(self._handle)
    return nvml_index

231  

232 @property 

def uuid(self) -> str:
    """
    The globally unique, immutable UUID of this device: a 5 part
    hexadecimal string that augments the immutable board serial
    identifier.

    As in the upstream NVML C++ API, the value carries a ``gpu-`` or
    ``mig-`` prefix. Use `uuid_without_prefix` when the bare UUID is
    needed (for example, to interact with CUDA).
    """
    prefixed_uuid = nvml.device_get_uuid(self._handle)
    return prefixed_uuid

244  

245 @property 

def uuid_without_prefix(self) -> str:
    """
    The globally unique, immutable UUID of this device with the NVML
    prefix removed, matching the UUIDs used in CUDA.

    The upstream NVML C++ API reports the UUID with a ``gpu-`` or
    ``mig-`` prefix. Use the `uuid` property if that prefix is needed.
    """
    prefixed_uuid = nvml.device_get_uuid(self._handle)
    # Drop the leading four characters (the `gpu-` / `mig-` prefix).
    return prefixed_uuid[4:]

258  

259 @property 

def pci_bus_id(self) -> str:
    """
    PCI bus ID of this device, read from :attr:`pci_info`.
    """
    info = self.pci_info
    return info.bus_id

265  

266 @property 

def numa_node_id(self) -> int:
    """
    NUMA node of this GPU device.

    Only meaningful on platforms where the GPUs are NUMA nodes.
    """
    node_id = nvml.device_get_numa_node_id(self._handle)
    return node_id

274  

275 @property 

def arch(self) -> DeviceArch:
    """
    :obj:`~DeviceArch` device architecture.

    For example, a Tesla V100 will report ``DeviceArch.name == "VOLTA"``,
    and RTX A6000 will report ``DeviceArch.name == "AMPERE"``.

    Architecture values that :obj:`~DeviceArch` does not recognize are
    reported as ``DeviceArch.UNKNOWN``.
    """
    raw_arch = nvml.device_get_architecture(self._handle)
    try:
        resolved = DeviceArch(raw_arch)
    except ValueError:
        # The enum has no member for this NVML value.
        resolved = DeviceArch.UNKNOWN
    return resolved

289  

290 @property 

def name(self) -> str:
    """
    Product name of the device, e.g.: `"Tesla V100-SXM2-32GB"`
    """
    device_name = nvml.device_get_name(self._handle)
    return device_name

296  

297 @property 

def brand(self) -> str:
    """
    Brand name of the device.

    Returns "Unknown" when NVML reports a brand that has no entry in the
    brand table.
    """
    brand_code = nvml.device_get_brand(self._handle)
    return _BRAND_TYPE_MAPPING.get(brand_code, "Unknown")

305  

306 @property 

def serial(self) -> str:
    """
    Globally unique serial number of the board this device sits on.

    For all products with an InfoROM.
    """
    board_serial = nvml.device_get_serial(self._handle)
    return board_serial

315  

316 @property 

def module_id(self) -> int:
    """
    Unique identifier of this device's module on the baseboard.

    Each GPU module on a given baseboard has its own identifier. For
    non-baseboard products the ID is always 0.
    """
    module = nvml.device_get_module_id(self._handle)
    return module

326  

327 @property 

def minor_number(self) -> int:
    """
    Minor number of this device.

    For Linux only.

    The Linux device driver uses the minor number to identify the device
    node in ``/dev/nvidiaX``.
    """
    minor = nvml.device_get_minor_number(self._handle)
    return minor

338  

339 @property 

def is_c2c_enabled(self) -> bool:
    """
    ``True`` when C2C (Chip-to-Chip) mode is enabled for this device.
    """
    c2c_info = nvml.device_get_c2c_mode_info_v(self._handle)
    return bool(c2c_info.is_c2c_enabled)

345  

346 @property 

def is_persistence_mode_enabled(self) -> bool:
    """
    ``True`` when persistence mode is enabled for this device.

    For Linux only.
    """
    mode = nvml.device_get_persistence_mode(self._handle)
    return mode == nvml.EnableState.FEATURE_ENABLED

354  

355 @is_persistence_mode_enabled.setter 

def is_persistence_mode_enabled(self, enabled: bool) -> None:
    # Translate the truthiness of `enabled` into the NVML enable state
    # and apply it to this device.
    if enabled:
        target_state = nvml.EnableState.FEATURE_ENABLED
    else:
        target_state = nvml.EnableState.FEATURE_DISABLED
    nvml.device_set_persistence_mode(self._handle, target_state)

361  

362 @property 

def cuda_compute_capability(self) -> tuple[int, int]:
    """
    CUDA compute capability of the device as a `(major, minor)` tuple,
    e.g.: `(7, 0)` for a Tesla V100.
    """
    capability = nvml.device_get_cuda_compute_capability(self._handle)
    return capability

370  

def to_cuda_device(self) -> "cuda.core.Device":
    """
    Get the corresponding :class:`cuda.core.Device` (which is used for CUDA
    access) for this :class:`cuda.core.system.Device` (which is used for
    NVIDIA Management Library (NVML) access).

    The devices are mapped to one another by their UUID.

    Returns
    -------
    cuda.core.Device
        The corresponding CUDA device.

    Raises
    ------
    RuntimeError
        No corresponding CUDA device is found for this NVML device.

        For example, on a MIG system, the physical GPU will not have an
        available CUDA device, since it can not be used directly, even
        though it can be enumerated from NVML.
    """
    from cuda.core import Device as CudaDevice

    # CUDA does not have an API to get a device by its UUID, so we just
    # search all the devices for one with a matching UUID.
    #
    # Each read of `uuid_without_prefix` is a fresh NVML query, so read it
    # once here instead of once per candidate CUDA device.
    target_uuid = self.uuid_without_prefix

    for cuda_device in CudaDevice.get_all_devices():
        if cuda_device.uuid == target_uuid:
            return cuda_device

    raise RuntimeError("No corresponding CUDA device found for this NVML device.")

403  

404 @classmethod 

def get_device_count(cls) -> int:
    """
    Number of devices NVML reports as available.

    NVML is initialized first if needed.

    Returns
    -------
    int
        The number of available devices.
    """
    initialize()
    device_total = nvml.device_get_count_v2()
    return device_total

417  

418 @classmethod 

def get_all_devices(cls) -> Iterable[Device]:
    """
    Lazily yield one device instance per available NVML index.

    Returns
    -------
    Iterator over :obj:`~Device`
        An iterator over available devices.
    """
    initialize()

    device_total = nvml.device_get_count_v2()
    for nvml_index in range(device_total):
        yield cls(index=nvml_index)

432  

433 ######################################################################### 

434 # ADDRESSING MODE 

435  

436 @property 

def addressing_mode(self) -> AddressingMode | None:
    """
    The :obj:`~AddressingMode` of the device, or ``None`` when NVML
    reports a mode that has no entry in the addressing-mode table.
    """
    raw_mode = nvml.device_get_addressing_mode(self._handle).value
    return _ADDRESSING_MODE_MAPPING.get(raw_mode)

442  

443 ######################################################################### 

444 # MIG (MULTI-INSTANCE GPU) DEVICES 

445  

446 @property 

def mig(self) -> MigInfo:
    """
    :obj:`~MigInfo` accessor for MIG (Multi-Instance GPU) information
    about this device.

    For Ampere™ or newer fully supported devices.
    """
    accessor = MigInfo(self)
    return accessor

454  

455 ######################################################################### 

456 # AFFINITY 

457  

458 @classmethod 

def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]:
    """
    Retrieve the set of GPUs that have a CPU affinity with the given CPU number.

    Supported on Linux only.

    Parameters
    ----------
    cpu_index: int
        The CPU index.

    Returns
    -------
    Iterator of :obj:`~Device`
        An iterator over the devices that have affinity with ``cpu_index``.
    """
    cdef Device device

    # Like the other classmethod entry points, this can be the first NVML
    # call a program makes, so make sure NVML is initialized before
    # querying the topology.
    initialize()

    for handle in nvml.system_get_topology_gpu_set(cpu_index):
        # NVML already handed us a handle, so bypass `__init__` (which
        # would look one up again) and attach the handle directly.
        device = Device.__new__(Device)
        device._handle = handle
        yield device

480  

def get_memory_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    List the indices of NUMA nodes or CPU sockets that have the ideal
    memory affinity for this device.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    If requested scope is not applicable to the target topology, the API
    will fall back to reporting the memory affinity for the immediate
    non-I/O ancestor of the device.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        The scope of the affinity query. Must be one of the values of
        :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        A list of indices of NUMA nodes or CPU sockets with the ideal
        memory affinity for the device.

    Raises
    ------
    ValueError
        If ``scope`` is not a known affinity scope.
    """
    try:
        nvml_scope = _AFFINITY_SCOPE_MAPPING[scope]
    except KeyError:
        valid_scopes = list(AffinityScope.__members__.values())
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {valid_scopes}"
        ) from None

    # The mask NVML fills is sized in 64-bit words: one bit per CPU,
    # rounded up.
    mask_words = <unsigned int>ceil(cpu_count() / 64)
    bitmask = nvml.device_get_memory_affinity(self._handle, mask_words, nvml_scope)
    return _unpack_bitmask(bitmask)

520  

def get_cpu_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    List the indices of the CPUs that have the ideal CPU affinity for
    this device within the given scope (NUMA node or CPU socket).

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    If requested scope is not applicable to the target topology, the API
    will fall back to reporting the affinity for the immediate non-I/O
    ancestor of the device.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        The scope of the affinity query. Must be one of the values of
        :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        The bit positions set in the CPU affinity mask returned by NVML.

    Raises
    ------
    ValueError
        If ``scope`` is not a known affinity scope.
    """
    # NOTE(review): the earlier docstring described *memory* affinity and
    # NUMA-node/socket indices, which looks copied from
    # `get_memory_affinity`; confirm the fallback wording against the NVML
    # documentation for the CPU-affinity-within-scope query.
    try:
        nvml_scope = _AFFINITY_SCOPE_MAPPING[scope]
    except KeyError:
        valid_scopes = list(AffinityScope.__members__.values())
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {valid_scopes}"
        ) from None

    # The mask NVML fills is sized in 64-bit words: one bit per CPU,
    # rounded up.
    mask_words = <unsigned int>ceil(cpu_count() / 64)
    bitmask = nvml.device_get_cpu_affinity_within_scope(self._handle, mask_words, nvml_scope)
    return _unpack_bitmask(bitmask)

560  

def set_cpu_affinity(self) -> None:
    """
    Bind the calling thread to the ideal CPU affinity for this device.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    handle = self._handle
    nvml.device_set_cpu_affinity(handle)

570  

def clear_cpu_affinity(self) -> None:
    """
    Remove every affinity binding from the calling thread.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    handle = self._handle
    nvml.device_clear_cpu_affinity(handle)

580  

581 ######################################################################### 

582 # CLOCK 

583 # See external class definitions in _clock.pxi 

584  

def get_clock(self, clock_type: ClockType | str) -> ClockInfo:
    """
    Build a :obj:`~_device.ClockInfo` for one clock of this device, used to
    query and manage that clock.
    """
    clock = ClockInfo(self._handle, clock_type)
    return clock

590  

591 @property 

def is_auto_boosted_clocks_enabled(self) -> tuple[bool, bool]:
    """
    Current and default state of auto boosted clocks on this device.

    For Kepler™ or newer fully supported devices.

    Auto Boosted clocks are enabled by default on some hardware, allowing
    the GPU to run at higher clock rates to maximize performance as
    thermal limits allow.

    On Pascal™ and newer hardware, Auto Boosted clocks are controlled
    through application clocks. Use :meth:`set_application_clocks` and
    :meth:`reset_application_clocks` to control Auto Boost behavior.

    Returns
    -------
    bool
        The current state of Auto Boosted clocks
    bool
        The default Auto Boosted clocks behavior

    """
    states = nvml.device_get_auto_boosted_clocks_enabled(self._handle)
    current_state, default_state = states
    enabled = nvml.EnableState.FEATURE_ENABLED
    return current_state == enabled, default_state == enabled

616  

617 @property 

def current_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    The :obj:`~ClocksEventReasons` currently in effect on this device.

    For all fully supported products.

    Raises
    ------
    ValueError
        If NVML reports a reason bit that has no known mapping.
    """
    cdef uint64_t[1] mask
    mask[0] = nvml.device_get_current_clocks_event_reasons(self._handle)

    decoded = []
    # `_unpack_bitmask` yields bit positions; the table is keyed by the
    # corresponding single-bit value.
    for bit_position in _unpack_bitmask(mask):
        flag = 1 << bit_position
        try:
            decoded.append(_CLOCKS_EVENT_REASONS_MAPPING[flag])
        except KeyError:
            raise ValueError(f"Unknown clock event reason bit: {flag}")
    return decoded

634  

635 @property 

def supported_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    The :obj:`~ClocksEventReasons` this device supports, i.e. the ones
    that can appear in :attr:`current_clock_event_reasons`.

    For all fully supported products.

    This method is not supported in virtual machines running virtual GPU (vGPU).

    Raises
    ------
    ValueError
        If NVML reports a reason bit that has no known mapping.
    """
    cdef uint64_t[1] mask
    mask[0] = nvml.device_get_supported_clocks_event_reasons(self._handle)

    decoded = []
    # `_unpack_bitmask` yields bit positions; the table is keyed by the
    # corresponding single-bit value.
    for bit_position in _unpack_bitmask(mask):
        flag = 1 << bit_position
        try:
            decoded.append(_CLOCKS_EVENT_REASONS_MAPPING[flag])
        except KeyError:
            raise ValueError(f"Unknown clock event reason bit: {flag}")
    return decoded

655  

656 ########################################################################## 

657 # COOLER 

658 # See external class definitions in _cooler.pxi 

659  

660 @property 

def cooler(self) -> CoolerInfo:
    """
    :obj:`~_device.CoolerInfo` wrapping the cooler information NVML reports
    for this device.
    """
    raw_info = nvml.device_get_cooler_info(self._handle)
    return CoolerInfo(raw_info)

666  

667 ########################################################################## 

668 # DEVICE ATTRIBUTES 

669 # See external class definitions in _device_attributes.pxi 

670  

671 @property 

def attributes(self) -> DeviceAttributes:
    """
    :obj:`~_device.DeviceAttributes` wrapping the attributes NVML reports
    for this device.

    For Ampere™ or newer fully supported devices. Only available on Linux
    systems.
    """
    raw_attributes = nvml.device_get_attributes_v2(self._handle)
    return DeviceAttributes(raw_attributes)

680  

681 ######################################################################### 

682 # DISPLAY 

683  

684 @property 

def is_display_connected(self) -> bool:
    """
    Display mode of this device.

    ``True`` when a physical display (e.g. monitor) is currently
    connected to any of the device's connectors.
    """
    display_mode = nvml.device_get_display_mode(self._handle)
    return display_mode == nvml.EnableState.FEATURE_ENABLED

693  

694 @property 

def is_display_active(self) -> bool:
    """
    Display active status of this device.

    ``True`` when a display is initialized on the device, for example
    when X Server is attached to it and has allocated memory for the
    screen.

    A display can be active even when no monitor is physically attached.
    """
    active_state = nvml.device_get_display_active(self._handle)
    return active_state == nvml.EnableState.FEATURE_ENABLED

706  

707 ########################################################################## 

708 # EVENTS 

709 # See external class definitions in _event.pxi 

710  

def register_events(self, events: EventType | str | list[EventType | str]) -> DeviceEvents:
    """
    Begin recording the given event types on this device.

    For Fermi™ or newer fully supported devices. For Linux only.

    ECC events are available only on ECC-enabled devices (see
    :meth:`Device.get_total_ecc_errors`). Power capping events are
    available only on Power Management enabled devices (see
    :meth:`Device.get_power_management_mode`).

    Recording starts with this call; events that occurred earlier are not
    recorded. Use :meth:`DeviceEvents.wait` on the returned object to wait
    for events.

    Examples
    --------
    >>> device = Device(index=0)
    >>> events = device.register_events([
    ...     EventType.XID_CRITICAL_ERROR,
    ... ])
    >>> while event := events.wait(timeout_ms=10000):
    ...     print(f"Event {event.event_type} occurred on device {event.device.uuid}")

    Parameters
    ----------
    events: EventType, str, or list of EventType or str
        The event type or list of event types to register for this device.

    Returns
    -------
    :obj:`~_device.DeviceEvents`
        An object representing the registered events. Call
        :meth:`~_device.DeviceEvents.wait` on this object to wait for events.

    Raises
    ------
    :class:`cuda.core.system.NotSupportedError`
        None of the requested event types are registered.
    """
    registration = DeviceEvents(self._handle, events)
    return registration

752  

def get_supported_event_types(self) -> list[EventType]:
    """
    List the event types this device supports.

    For Fermi™ or newer fully supported devices. For Linux only (returns an
    empty list on Windows).

    Returns
    -------
    list[EventType]
        The list of supported event types.

    Raises
    ------
    ValueError
        If NVML reports an event bit that has no known mapping.
    """
    cdef uint64_t[1] mask
    mask[0] = nvml.device_get_supported_event_types(self._handle)

    supported = []
    # `_unpack_bitmask` yields bit positions; the table is keyed by the
    # corresponding single-bit value.
    for bit_position in _unpack_bitmask(mask):
        flag = 1 << bit_position
        try:
            supported.append(_EVENT_TYPE_MAPPING[flag])
        except KeyError:
            raise ValueError(f"Unknown event type bit: {flag}")
    return supported

775  

776 ########################################################################## 

777 # FAN 

778 # See external class definitions in _fan.pxi 

779  

def get_fan(self, fan: int = 0) -> FanInfo:
    """
    :obj:`~_device.FanInfo` object to get information and manage a specific fan on a device.

    Parameters
    ----------
    fan: int, optional
        Index of the fan. Must be in ``[0, num_fans)``. Default is ``0``.

    Raises
    ------
    ValueError
        If ``fan`` is outside ``[0, num_fans)``.
    """
    # `num_fans` is an NVML query each time it is read; read it once so
    # the bounds check and the error message use the same value.
    fan_count = self.num_fans
    if fan < 0 or fan >= fan_count:
        raise ValueError(f"Fan index {fan} is out of range [0, {fan_count})")
    return FanInfo(self._handle, fan)

787  

788 @property 

def num_fans(self) -> int:
    """
    Number of fans NVML reports for this device.
    """
    fan_count = nvml.device_get_num_fans(self._handle)
    return fan_count

794  

795 ########################################################################## 

796 # FIELD VALUES 

797 # See external class definitions in _field_values.pxi 

798  

def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues:
    """
    Query several field values from the device in one call.

    Each requested value can carry its own exception. That exception is
    raised when the corresponding ``value`` is accessed on the returned
    :obj:`~_device.FieldValues` container.

    To confirm that there are no exceptions in the entire container, call
    :meth:`~_device.FieldValues.validate`.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        List of field IDs to query.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).

    Returns
    -------
    :obj:`~_device.FieldValues`
        Container of field values corresponding to the requested field IDs.
    """
    if len(field_ids) > 0:
        return FieldValues(nvml.device_get_field_values(self._handle, field_ids))

    # NVML rejects an empty request with an InvalidArgumentError, so an
    # empty container is returned without calling it.
    return FieldValues(nvml.FieldValue(0))

829  

def clear_field_values(self, field_ids: list[int | tuple[int, int]]) -> None:
    """
    Clear several field values on the device in one call.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        List of field IDs to clear.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).
    """
    # NVML rejects an empty request with an InvalidArgumentError, so it
    # is only called when there is something to clear.
    if len(field_ids) != 0:
        nvml.device_clear_field_values(self._handle, field_ids)

848  

849 ########################################################################## 

850 # INFOROM 

851 # See external class definitions in _inforom.pxi 

852  

853 @property 

def inforom(self) -> InforomInfo:
    """
    :obj:`~_device.InforomInfo` accessor for this device's InfoROM
    information.

    For all products with an InfoROM.
    """
    accessor = InforomInfo(self)
    return accessor

861  

862 ########################################################################## 

863 # MEMORY 

864 # See external class definitions in _memory.pxi 

865  

866 @property 

def bar1_memory_info(self) -> BAR1MemoryInfo:
    """
    :obj:`~_device.BAR1MemoryInfo` wrapping the BAR1 memory information
    NVML reports for this device.

    BAR1 is used to map the FB (device memory) so that it can be directly
    accessed by the CPU or by 3rd party devices (peer-to-peer on the PCIE
    bus).
    """
    raw_info = nvml.device_get_bar1_memory_info(self._handle)
    return BAR1MemoryInfo(raw_info)

876  

877 @property 

def memory_info(self) -> MemoryInfo:
    """
    :obj:`~_device.MemoryInfo` wrapping the memory information NVML
    reports for this device.
    """
    raw_info = nvml.device_get_memory_info_v2(self._handle)
    return MemoryInfo(raw_info)

883  

884 ########################################################################## 

885 # NVLINK 

886 # See external class definitions in _nvlink.pxi 

887  

888 @versionchanged( 

889 version="1.1.0", 

890 reason="Any link number not supported by this specific device will raise a `ValueError`." 

891 ) 

def get_nvlink(self, link: int) -> NvlinkInfo:
    """
    Get the :obj:`~NvlinkInfo` for one NVLink link of this device.

    For devices with NVLink support.

    Raises
    ------
    ValueError
        If ``link`` is outside ``[0, get_nvlink_count())``.
    """
    link_count = self.get_nvlink_count()
    if not (0 <= link < link_count):
        raise ValueError(f"Link index {link} is out of range [0, {link_count})")
    return NvlinkInfo(self, link)

902  

903 @versionadded(version="1.1.0") 

def get_nvlink_count(self) -> int:
    """
    Number of NVLink links on this device, read from the
    ``DEV_NVLINK_LINK_COUNT`` field value.

    For devices with NVLink support.
    """
    field_values = self.get_field_values([FieldId.DEV_NVLINK_LINK_COUNT])
    return field_values[0].value

911  

@versionadded(version="1.1.0")
def get_nvlinks(self) -> Iterable[NvlinkInfo]:
    """
    Get :obj:`~NvlinkInfo` about all NVLink links on this device.

    For devices with NVLink support.

    This is a generator: the link count is queried once, when iteration
    starts, and one :obj:`~NvlinkInfo` is yielded per link index.

    Returns
    -------
    Iterable of :obj:`~NvlinkInfo`
        One entry per link, in increasing link-index order.
    """
    # Every index produced by ``range(count)`` is in range by construction,
    # so build NvlinkInfo directly. Going through ``get_nvlink`` would
    # re-query the link count once per link just to re-validate the index.
    for link in range(self.get_nvlink_count()):
        yield NvlinkInfo(self, link)

921  

922 ########################################################################## 

923 # PCI INFO 

924 # See external class definitions in _pci_info.pxi 

925  

@property
def pci_info(self) -> PciInfo:
    """
    PCI attributes of this device, as a :obj:`~_device.PciInfo` object.

    Non-physical devices, such as MIG devices, may not have PCI attributes.

    Raises
    ------
    RuntimeError
        If NVML rejects the query with an invalid-argument error, which is
        taken to mean that the device has no PCI attributes.
    """
    try:
        raw_pci_info = nvml.device_get_pci_info_ext(self._handle)
    except nvml.InvalidArgumentError:
        # Hide the NVML error; the RuntimeError message is the user-facing one.
        raise RuntimeError("This device does not have PCI attributes") from None
    return PciInfo(raw_pci_info, self._handle)

940  

941 ########################################################################## 

942 # PERFORMANCE 

943 # See external class definitions in _performance.pxi 

944  

@property
def performance_state(self) -> int | None:
    """
    Current performance state (P-state) of the device.

    For Fermi™ or newer fully supported devices.

    Returns
    -------
    int | None
        The current performance state of the device, as an integer between 0 and 15,
        where 0 is maximum performance and higher numbers are lower performance.
        Returns `None` if the performance state is unknown.
    """
    raw_pstate = nvml.device_get_performance_state(self._handle)
    # Convert the NVML enum value to a plain integer (or None when unknown).
    return _pstate_to_int(raw_pstate)

960  

@property
def dynamic_pstates_info(self) -> GpuDynamicPstatesInfo:
    """
    Performance monitor samples from the associated subdevice, as a
    :obj:`~_device.GpuDynamicPstatesInfo` object.
    """
    raw_info = nvml.device_get_dynamic_pstates_info(self._handle)
    return GpuDynamicPstatesInfo(raw_info)

967  

@property
def supported_pstates(self) -> list[int]:
    """
    Get all supported Performance States (P-States) for the device.

    The returned list contains a contiguous list of valid P-States supported by
    the device.

    Returns
    -------
    list[int]
        The supported performance states of the device, each as an integer
        between 0 and 15, where 0 is maximum performance and higher numbers
        are lower performance.
    """
    # From nvml.h:
    # The returned array would contain a contiguous list of valid P-States
    # supported by the device. If the number of supported P-States is fewer
    # than the size of the array supplied missing elements would contain \a
    # NVML_PSTATE_UNKNOWN.

    raw_pstates = nvml.device_get_supported_performance_states(self._handle)
    # _pstate_to_int maps PSTATE_UNKNOWN to None; those padding entries are
    # dropped so only real P-states are returned.
    converted = (_pstate_to_int(pstate) for pstate in raw_pstates)
    return [value for value in converted if value is not None]

995  

996 ########################################################################## 

997 # PROCESS 

998 # See external class definitions in _process.pxi 

999  

@property
def compute_running_processes(self) -> list[ProcessInfo]:
    """
    Information about the processes that hold a compute context on this device.

    For Fermi™ or newer fully supported devices.

    Only compute running processes are reported (e.g. CUDA applications
    that have an active context). Graphics applications (e.g. using OpenGL,
    DirectX) are not listed.

    The information returned is dynamic: the number of elements might
    change over time.

    In MIG mode, if device handle is provided, the API returns aggregate
    information, only if the caller has appropriate privileges. Per-instance
    information can be queried by using specific MIG device handles.
    Querying per-instance information using MIG device handles is not
    supported if the device is in vGPU Host virtualization mode.
    """
    raw_processes = nvml.device_get_compute_running_processes_v3(self._handle)
    # Wrap each raw NVML record together with the owning device.
    return [ProcessInfo(self, raw_process) for raw_process in raw_processes]

1022  

1023 ########################################################################## 

1024 # REPAIR STATUS 

1025 # See external class definitions in _repair_status.pxi 

1026  

@property
def repair_status(self) -> RepairStatus:
    """
    TPC/Channel repair status, as a :obj:`~_device.RepairStatus` object.

    For Ampere™ or newer fully supported devices.
    """
    # The wrapper receives the raw device handle, not the Device object.
    status = RepairStatus(self._handle)
    return status

1035  

1036 ########################################################################## 

1037 # TEMPERATURE 

1038 # See external class definitions in _temperature.pxi 

1039  

@property
def temperature(self) -> Temperature:
    """
    Temperature information for the device, as a :obj:`~_device.Temperature` object.
    """
    # The wrapper receives the raw device handle, not the Device object.
    temperature_info = Temperature(self._handle)
    return temperature_info

1046  

1047 ####################################################################### 

1048 # TOPOLOGY 

1049  

def get_topology_nearest_gpus(self, level: GpuTopologyLevel | str) -> Iterable[Device]:
    """
    Yield the GPUs nearest to this device at a given interconnectivity level.

    Supported on Linux only.

    This is a generator, so neither the validation of ``level`` nor the
    NVML query runs until the result is iterated.

    Parameters
    ----------
    level: :class:`GpuTopologyLevel` | str
        The topology level.

    Returns
    -------
    Iterable of :class:`Device`
        The nearest devices at the given topology level.

    Raises
    ------
    ValueError
        If ``level`` is not a key of the topology-level mapping.
    """
    cdef Device neighbor
    try:
        nvml_level = _GPU_TOPOLOGY_LEVEL_MAPPING[level]
    except KeyError:
        raise ValueError(
            f"Invalid topology level: {level}. "
            f"Must be one of {list(GpuTopologyLevel.__members__.values())}"
        ) from None
    neighbor_handles = nvml.device_get_topology_nearest_gpus(self._handle, nvml_level)
    for neighbor_handle in neighbor_handles:
        # Allocate via __new__ and attach the handle NVML returned directly.
        neighbor = Device.__new__(Device)
        neighbor._handle = neighbor_handle
        yield neighbor

1078  

1079 ####################################################################### 

1080 # UTILIZATION 

1081  

@property
def utilization(self) -> Utilization:
    """
    Current :obj:`~Utilization` rates for the device's major subsystems.

    For Fermi™ or newer fully supported devices.

    Note: During driver initialization when ECC is enabled one can see high
    GPU and Memory Utilization readings. This is caused by ECC Memory
    Scrubbing mechanism that is performed during driver initialization.

    Note: On MIG-enabled GPUs, querying device utilization rates is not
    currently supported.

    Returns
    -------
    Utilization
        An object containing the current utilization rates for the device.
    """
    raw_rates = nvml.device_get_utilization_rates(self._handle)
    return Utilization(raw_rates)

1103  

1104  

def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel:
    """
    Return the common ancestor topology level of two devices.

    For Linux only.

    Parameters
    ----------
    device1: :class:`Device`
        The first device.
    device2: :class:`Device`
        The second device.

    Returns
    -------
    :class:`GpuTopologyLevel`
        The common ancestor level of the two devices.
    """
    nvml_level = nvml.device_get_topology_common_ancestor(
        device1._handle,
        device2._handle,
    )
    # Translate the NVML value back to the public GpuTopologyLevel; a value
    # missing from the inverse mapping raises KeyError.
    return _GPU_TOPOLOGY_LEVEL_INV_MAPPING[nvml_level]

1129  

1130  

def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex | str) -> GpuP2PStatus:
    """
    Return the P2P status between two devices for one capability.

    Parameters
    ----------
    device1: :class:`Device`
        The first device.
    device2: :class:`Device`
        The second device.
    index: :class:`GpuP2PCapsIndex` | str
        The P2P capability index being looked for between ``device1`` and ``device2``.

    Returns
    -------
    :class:`GpuP2PStatus`
        The P2P status between the two devices. A status value missing from
        the status mapping is reported as ``GpuP2PStatus.UNKNOWN``.

    Raises
    ------
    ValueError
        If ``index`` is not a key of the P2P caps index mapping.
    """
    try:
        caps_index = _GPU_P2P_CAPS_INDEX_MAPPING[index]
    except KeyError:
        raise ValueError(
            f"Invalid P2P caps index: {index}. "
            f"Must be one of {list(GpuP2PCapsIndex.__members__.values())}"
        ) from None

    raw_status = nvml.device_get_p2p_status(
        device1._handle,
        device2._handle,
        caps_index,
    )
    # Fall back to UNKNOWN instead of raising on an unmapped status value.
    return _GPU_P2P_STATUS_MAPPING.get(raw_status, GpuP2PStatus.UNKNOWN)

1164  

1165  

# Names exported by ``from <this module> import *``.
__all__ = [
    "Device",
    "get_p2p_status",
    "get_topology_common_ancestor",
    "NvlinkInfo",
]