Coverage for cuda/core/system/_device.pyx: 75.00%

256 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-13 01:38 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from libc.stdint cimport intptr_t, uint64_t 

6from libc.math cimport ceil 

7  

8from multiprocessing import cpu_count 

9from typing import Iterable, TYPE_CHECKING 

10import warnings 

11  

12from cuda.bindings import nvml 

13  

14from ._nvml_context cimport initialize 

15from cuda.core.system.typing import ( 

16 AddressingMode, 

17 AffinityScope, 

18 DeviceArch, 

19 ClockId, 

20 ClocksEventReasons, 

21 ClockType, 

22 CoolerControl, 

23 CoolerTarget, 

24 DeviceArch, 

25 EventType, 

26 FanControlPolicy, 

27 FieldId, 

28 GpuP2PCapsIndex, 

29 GpuP2PStatus, 

30 GpuTopologyLevel, 

31 InforomObject, 

32 TemperatureThresholds, 

33 ThermalController, 

34 ThermalTarget, 

35) 

36  

37if TYPE_CHECKING: 

38 import cuda.core # no-cython-lint 

39  

40  

cdef object _pstate_to_int(object pstate):
    """
    Convert an NVML P-state enum value into a plain integer P-state number.

    Returns ``None`` when `pstate` equals ``nvml.Pstates.PSTATE_UNKNOWN``;
    otherwise returns the value's offset from ``nvml.Pstates.PSTATE_0``.
    """
    if pstate == nvml.Pstates.PSTATE_UNKNOWN:
        return None

    raw_value = int(pstate)
    # Internal sanity check on the value handed to us, not user input validation.
    assert 0 <= raw_value <= 15, (
        f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive, or PSTATE_UNKNOWN."
    )
    return raw_value - int(nvml.Pstates.PSTATE_0)

48  

49  

cdef int _pstate_to_enum(int pstate):
    """
    Convert an integer P-state number (0..15) into the matching NVML enum value.

    Raises ``ValueError`` when `pstate` is outside the inclusive range 0..15.
    """
    if not (0 <= pstate <= 15):
        raise ValueError(f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive.")

    # The enum value is the P-state number offset by the value of PSTATE_0.
    base_value = int(nvml.Pstates.PSTATE_0)
    return int(pstate) + base_value

54  

55  

56include "_clock.pxi" 

57include "_cooler.pxi" 

58include "_device_attributes.pxi" 

59include "_device_utils.pxi" 

60include "_event.pxi" 

61include "_fan.pxi" 

62include "_field_values.pxi" 

63include "_inforom.pxi" 

64include "_memory.pxi" 

65include "_mig.pxi" 

66include "_nvlink.pxi" 

67include "_pci_info.pxi" 

68include "_performance.pxi" 

69include "_process.pxi" 

70include "_repair_status.pxi" 

71include "_temperature.pxi" 

72include "_utilization.pxi" 

73  

74  

# NVML addressing-mode enum -> public AddressingMode.
# Consumed by Device.addressing_mode, which returns None for values not listed here.
_ADDRESSING_MODE_MAPPING = {
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_HMM: AddressingMode.HMM,
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_ATS: AddressingMode.ATS,
}


# Public AffinityScope -> NVML affinity scope.
# Consumed by Device.get_memory_affinity and Device.get_cpu_affinity; a missing
# key is reported to the caller as a ValueError.
_AFFINITY_SCOPE_MAPPING = {
    AffinityScope.NODE: nvml.AffinityScope.NODE,
    AffinityScope.SOCKET: nvml.AffinityScope.SOCKET,
}


# NVML brand enum -> human-readable brand name.
# Consumed by Device.brand, which falls back to "Unknown" for unlisted values.
_BRAND_TYPE_MAPPING = {
    nvml.BrandType.BRAND_UNKNOWN: "Unknown",
    nvml.BrandType.BRAND_QUADRO: "Quadro",
    nvml.BrandType.BRAND_TESLA: "Tesla",
    nvml.BrandType.BRAND_NVS: "NVS",
    nvml.BrandType.BRAND_GRID: "GRID",
    nvml.BrandType.BRAND_GEFORCE: "GeForce",
    nvml.BrandType.BRAND_TITAN: "Titan",
    nvml.BrandType.BRAND_NVIDIA_VAPPS: "NVIDIA vApps",
    nvml.BrandType.BRAND_NVIDIA_VPC: "NVIDIA VPC",
    nvml.BrandType.BRAND_NVIDIA_VCS: "NVIDIA VCS",
    nvml.BrandType.BRAND_NVIDIA_VWS: "NVIDIA VWS",
    nvml.BrandType.BRAND_NVIDIA_CLOUD_GAMING: "NVIDIA Cloud Gaming",
    nvml.BrandType.BRAND_NVIDIA_VGAMING: "NVIDIA vGaming",
    nvml.BrandType.BRAND_QUADRO_RTX: "Quadro RTX",
    nvml.BrandType.BRAND_NVIDIA_RTX: "NVIDIA RTX",
    nvml.BrandType.BRAND_NVIDIA: "NVIDIA",
    nvml.BrandType.BRAND_GEFORCE_RTX: "GeForce RTX",
    nvml.BrandType.BRAND_TITAN_RTX: "Titan RTX",
}


# Public GpuP2PCapsIndex -> NVML P2P capability index.
_GPU_P2P_CAPS_INDEX_MAPPING = {
    GpuP2PCapsIndex.READ: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ,
    GpuP2PCapsIndex.WRITE: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE,
    GpuP2PCapsIndex.NVLINK: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_NVLINK,
    GpuP2PCapsIndex.ATOMICS: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_ATOMICS,
    GpuP2PCapsIndex.PCI: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PCI,
    GpuP2PCapsIndex.PROP: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PROP,
    GpuP2PCapsIndex.UNKNOWN: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_UNKNOWN,
}


# NVML P2P status enum -> public GpuP2PStatus.
_GPU_P2P_STATUS_MAPPING = {
    nvml.GpuP2PStatus.P2P_STATUS_OK: GpuP2PStatus.OK,
    nvml.GpuP2PStatus.P2P_STATUS_CHIPSET_NOT_SUPPORTED: GpuP2PStatus.CHIPSET_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_GPU_NOT_SUPPORTED: GpuP2PStatus.GPU_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED: GpuP2PStatus.IOH_TOPOLOGY_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_DISABLED_BY_REGKEY: GpuP2PStatus.DISABLED_BY_REGKEY,
    nvml.GpuP2PStatus.P2P_STATUS_NOT_SUPPORTED: GpuP2PStatus.NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_UNKNOWN: GpuP2PStatus.UNKNOWN,
}


# Public GpuTopologyLevel -> NVML topology level.
_GPU_TOPOLOGY_LEVEL_MAPPING = {
    GpuTopologyLevel.INTERNAL: nvml.GpuTopologyLevel.TOPOLOGY_INTERNAL,
    GpuTopologyLevel.SINGLE: nvml.GpuTopologyLevel.TOPOLOGY_SINGLE,
    GpuTopologyLevel.MULTIPLE: nvml.GpuTopologyLevel.TOPOLOGY_MULTIPLE,
    GpuTopologyLevel.HOSTBRIDGE: nvml.GpuTopologyLevel.TOPOLOGY_HOSTBRIDGE,
    GpuTopologyLevel.NODE: nvml.GpuTopologyLevel.TOPOLOGY_NODE,
    GpuTopologyLevel.SYSTEM: nvml.GpuTopologyLevel.TOPOLOGY_SYSTEM,
}


# Reverse lookup: NVML topology level -> public GpuTopologyLevel.
_GPU_TOPOLOGY_LEVEL_INV_MAPPING = {v: k for k, v in _GPU_TOPOLOGY_LEVEL_MAPPING.items()}

142  

143  

144  

145cdef class Device: 

146 """ 

147 Representation of a device. 

148  

149 :class:`cuda.core.system.Device` provides access to various pieces of metadata 

150 about devices and their topology, as provided by the NVIDIA Management 

151 Library (NVML). To use CUDA with a device, use :class:`cuda.core.Device`. 

152  

153 Creating a device instance causes NVML to initialize the target GPU. 

154 NVML may initialize additional GPUs if the target GPU is an SLI slave. 

155  

156 Parameters 

157 ---------- 

158 index: int, optional 

159 Integer representing the CUDA device index to get a handle to. Valid 

160 values are between ``0`` and ``cuda.core.system.get_num_devices() - 1``. 

161  

162 The order in which devices are enumerated has no guarantees of 

163 consistency between reboots. For that reason, it is recommended that 

164 devices are looked up by their PCI ids or UUID. 

165  

166 uuid: bytes or str, optional 

167 UUID of a CUDA device to get a handle to. 

168  

169 pci_bus_id: bytes or str, optional 

170 PCI bus ID of a CUDA device to get a handle to. 

171  

172 Raises 

173 ------ 

174 ValueError 

175 If anything other than a single `index`, `uuid` or `pci_bus_id` are specified. 

176 """ 

177  

178 # This is made public for testing purposes only 

179 cdef public intptr_t _handle 

180  

def __init__(
    self,
    *,
    index: int | None = None,
    uuid: bytes | str | None = None,
    pci_bus_id: bytes | str | None = None,
) -> None:
    """
    Look up the NVML device handle from exactly one of `index`, `uuid`
    or `pci_bus_id` and store it on the instance.

    Raises
    ------
    ValueError
        If none, or more than one, of the selectors is given.
    """
    cdef int arg_count = len(
        [selector for selector in (index, uuid, pci_bus_id) if selector is not None]
    )

    if arg_count > 1:
        raise ValueError("Handle requires only one of `index`, `uuid`, or `pci_bus_id`.")
    if arg_count == 0:
        raise ValueError("Handle requires either a device `index`, `uuid`, or `pci_bus_id`.")

    initialize()

    if index is not None:
        self._handle = nvml.device_get_handle_by_index_v2(index)
        return

    if uuid is not None:
        # Bytes input is decoded to str before being handed to NVML.
        if isinstance(uuid, bytes):
            uuid = uuid.decode("ascii")
        self._handle = nvml.device_get_handle_by_uuid(uuid)
        return

    # Exactly one selector is set, so at this point it must be `pci_bus_id`.
    if isinstance(pci_bus_id, bytes):
        pci_bus_id = pci_bus_id.decode("ascii")
    self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)

208  

209 ######################################################################### 

210 # BASIC PROPERTIES 

211  

@property
def index(self) -> int:
    """
    NVML index of this device.

    Valid indices follow from the count returned by
    :meth:`Device.get_device_count`: if ``get_device_count()`` returns 2,
    the valid indices are 0 and 1, corresponding to GPU 0 and GPU 1.

    NVML gives no guarantee that the enumeration order is consistent
    between reboots, so looking devices up by their PCI ids or GPU UUID is
    recommended.

    Note: The NVML index may not correlate with other APIs, such as the
    CUDA device index.
    """
    nvml_index = nvml.device_get_index(self._handle)
    return nvml_index

230  

@property
def uuid(self) -> str:
    """
    Globally unique immutable UUID associated with this device, as a 5 part
    hexadecimal string, that augments the immutable, board serial
    identifier.

    In the upstream NVML C++ API, the UUID includes a ``gpu-`` or ``mig-``
    prefix, and this property returns it as NVML reports it. For a UUID
    without that prefix (for example, to interact with CUDA), use the
    `uuid_without_prefix` property.
    """
    device_uuid = nvml.device_get_uuid(self._handle)
    return device_uuid

243  

@property
def uuid_without_prefix(self) -> str:
    """
    Globally unique immutable UUID associated with this device, as a 5 part
    hexadecimal string, that augments the immutable, board serial
    identifier.

    In the upstream NVML C++ API, the UUID includes a ``gpu-`` or ``mig-``
    prefix. This property strips it so the value matches the UUIDs used in
    CUDA. If you need the prefix, use the `uuid` property.
    """
    prefixed_uuid = nvml.device_get_uuid(self._handle)
    # Drop the first four characters, i.e. the `gpu-` / `mig-` prefix.
    return prefixed_uuid[4:]

257  

@property
def pci_bus_id(self) -> str:
    """
    PCI bus ID of this device, read from the ``bus_id`` attribute of
    :attr:`pci_info`.
    """
    info = self.pci_info
    return info.bus_id

264  

@property
def numa_node_id(self) -> int:
    """
    NUMA node of this GPU device.

    Only applies to platforms where the GPUs are NUMA nodes.
    """
    node_id = nvml.device_get_numa_node_id(self._handle)
    return node_id

273  

@property
def arch(self) -> DeviceArch:
    """
    :obj:`~DeviceArch` device architecture.

    For example, a Tesla V100 is expected to report an architecture whose
    ``name`` is ``"VOLTA"``, and an RTX A6000 one whose ``name`` is
    ``"AMPERE"``.

    Architecture values that :obj:`~DeviceArch` does not recognize are
    reported as ``DeviceArch.UNKNOWN``.
    """
    raw_arch = nvml.device_get_architecture(self._handle)
    try:
        resolved = DeviceArch(raw_arch)
    except ValueError:
        # NVML returned a value the enum does not know about.
        resolved = DeviceArch.UNKNOWN
    return resolved

288  

@property
def name(self) -> str:
    """
    Device name as reported by NVML, e.g.: `"Tesla V100-SXM2-32GB"`
    """
    device_name = nvml.device_get_name(self._handle)
    return device_name

295  

@property
def brand(self) -> str:
    """
    Brand of the device, as a human-readable string.

    Returns "Unknown" if the brand is unknown, including when NVML reports a
    brand value that has no entry in the brand table.
    """
    brand_code = nvml.device_get_brand(self._handle)
    return _BRAND_TYPE_MAPPING.get(brand_code, "Unknown")

304  

@property
def serial(self) -> str:
    """
    Globally unique board serial number associated with this device's
    board.

    For all products with an InfoROM.
    """
    serial_number = nvml.device_get_serial(self._handle)
    return serial_number

314  

@property
def module_id(self) -> int:
    """
    Unique identifier for the device module on the baseboard.

    Each GPU module that exists on a given baseboard has its own
    identifier. For non-baseboard products, this ID would always be 0.
    """
    module = nvml.device_get_module_id(self._handle)
    return module

325  

@property
def minor_number(self) -> int:
    """
    Minor number of this device.

    For Linux only.

    The Linux device driver uses the minor number to identify the device
    node in ``/dev/nvidiaX``.
    """
    minor = nvml.device_get_minor_number(self._handle)
    return minor

337  

@property
def is_c2c_enabled(self) -> bool:
    """
    ``True`` when C2C (Chip-to-Chip) mode is enabled for this device.
    """
    c2c_info = nvml.device_get_c2c_mode_info_v(self._handle)
    return bool(c2c_info.is_c2c_enabled)

344  

@property
def is_persistence_mode_enabled(self) -> bool:
    """
    ``True`` when persistence mode is enabled for this device.

    For Linux only.
    """
    mode = nvml.device_get_persistence_mode(self._handle)
    return mode == nvml.EnableState.FEATURE_ENABLED

353  

@is_persistence_mode_enabled.setter
def is_persistence_mode_enabled(self, enabled: bool) -> None:
    # Translate the truthiness of `enabled` into the NVML enable-state enum.
    if enabled:
        requested_state = nvml.EnableState.FEATURE_ENABLED
    else:
        requested_state = nvml.EnableState.FEATURE_DISABLED
    nvml.device_set_persistence_mode(self._handle, requested_state)

360  

@property
def cuda_compute_capability(self) -> tuple[int, int]:
    """
    CUDA compute capability of the device, e.g.: `(7, 0)` for a Tesla V100.

    The result is a tuple `(major, minor)`.
    """
    capability = nvml.device_get_cuda_compute_capability(self._handle)
    return capability

369  

def to_cuda_device(self) -> "cuda.core.Device":
    """
    Get the corresponding :class:`cuda.core.Device` (which is used for CUDA
    access) for this :class:`cuda.core.system.Device` (which is used for
    NVIDIA Management Library (NVML) access).

    The devices are mapped to one another by their UUID.

    Returns
    -------
    cuda.core.Device
        The corresponding CUDA device.

    Raises
    ------
    RuntimeError
        No corresponding CUDA device is found for this NVML device.

        For example, on a MIG system, the physical GPU will not have an
        available CUDA device, since it can not be used directly, even
        though it can be enumerated from NVML.
    """
    # Imported lazily, inside the method, rather than at module import time.
    from cuda.core import Device as CudaDevice

    # The UUID of this device does not change while we search, so query NVML
    # once instead of once per candidate CUDA device.
    target_uuid = self.uuid_without_prefix

    # CUDA does not have an API to get a device by its UUID, so we just
    # search all the devices for one with a matching UUID.
    for cuda_device in CudaDevice.get_all_devices():
        if cuda_device.uuid == target_uuid:
            return cuda_device

    raise RuntimeError("No corresponding CUDA device found for this NVML device.")

402  

@classmethod
def get_device_count(cls) -> int:
    """
    Number of available devices.

    NVML is initialized before the count is queried.

    Returns
    -------
    int
        The number of available devices.
    """
    initialize()
    device_count = nvml.device_get_count_v2()
    return device_count

416  

@classmethod
def get_all_devices(cls) -> Iterable[Device]:
    """
    Iterate over the available device instances.

    This is a generator: one device object is created per index, from 0 up
    to the device count reported by NVML.

    Returns
    -------
    Iterator over :obj:`~Device`
        An iterator over available devices.
    """
    initialize()

    total_devices = nvml.device_get_count_v2()
    for position in range(total_devices):
        yield cls(index=position)

431  

432 ######################################################################### 

433 # ADDRESSING MODE 

434  

@property
def addressing_mode(self) -> AddressingMode | None:
    """
    The :obj:`~AddressingMode` of the device, or ``None`` when NVML reports
    a mode that has no entry in the addressing-mode table.
    """
    mode_info = nvml.device_get_addressing_mode(self._handle)
    return _ADDRESSING_MODE_MAPPING.get(mode_info.value)

441  

442 ######################################################################### 

443 # MIG (MULTI-INSTANCE GPU) DEVICES 

444  

@property
def mig(self) -> MigInfo:
    """
    :obj:`~MigInfo` accessor for MIG (Multi-Instance GPU) information about
    this device.

    For Ampere™ or newer fully supported devices.
    """
    mig_accessor = MigInfo(self)
    return mig_accessor

453  

454 ######################################################################### 

455 # AFFINITY 

456  

@classmethod
def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]:
    """
    Retrieve the set of GPUs that have a CPU affinity with the given CPU number.

    Supported on Linux only.

    This is a generator; NVML is initialized and queried when iteration
    starts.

    Parameters
    ----------
    cpu_index: int
        The CPU index.

    Returns
    -------
    Iterator of :obj:`~Device`
        An iterator over the devices with affinity to `cpu_index`.
    """
    cdef Device device

    # Like the other class-level queries (`get_device_count`,
    # `get_all_devices`), make sure NVML is initialized first: this may be
    # the first NVML call made by the process.
    initialize()

    for handle in nvml.system_get_topology_gpu_set(cpu_index):
        # NVML already gave us the handle, so bypass `__init__` (which would
        # look the handle up again) and populate the instance directly.
        device = Device.__new__(Device)
        device._handle = handle
        yield device

479  

def get_memory_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    List the indices of NUMA nodes or CPU sockets with the ideal memory
    affinity for the device.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    If requested scope is not applicable to the target topology, the API
    will fall back to reporting the memory affinity for the immediate non-I/O
    ancestor of the device.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        The scope of the affinity query. Must be one of the values of
        :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        A list of indices of NUMA nodes or CPU sockets with the ideal memory
        affinity for the device.

    Raises
    ------
    ValueError
        If `scope` is not a recognized affinity scope.
    """
    try:
        nvml_scope = _AFFINITY_SCOPE_MAPPING[scope]
    except KeyError:
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {list(AffinityScope.__members__.values())}"
        ) from None

    # Size of the bitmask requested from NVML: one 64-bit word per 64 CPUs,
    # rounded up.
    mask_words = <unsigned int>ceil(cpu_count() / 64)
    affinity_mask = nvml.device_get_memory_affinity(self._handle, mask_words, nvml_scope)
    return _unpack_bitmask(affinity_mask)

519  

def get_cpu_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    Retrieves a list of indices of the CPUs with the ideal CPU affinity for
    the device, within the given NUMA node or CPU socket scope.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    If requested scope is not applicable to the target topology, the API
    will fall back to reporting the CPU affinity for the immediate non-I/O
    ancestor of the device.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        The scope of the affinity query. Must be one of the values of
        :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        A list of indices of CPUs with the ideal CPU affinity for the
        device (the positions of the bits set in the affinity bitmask
        returned by NVML).

    Raises
    ------
    ValueError
        If `scope` is not a recognized affinity scope.
    """
    try:
        nvml_scope = _AFFINITY_SCOPE_MAPPING[scope]
    except KeyError:
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {list(AffinityScope.__members__.values())}"
        ) from None

    # Size of the bitmask requested from NVML: one 64-bit word per 64 CPUs,
    # rounded up.
    mask_words = <unsigned int>ceil(cpu_count() / 64)
    affinity_mask = nvml.device_get_cpu_affinity_within_scope(
        self._handle,
        mask_words,
        nvml_scope,
    )
    return _unpack_bitmask(affinity_mask)

559  

def set_cpu_affinity(self) -> None:
    """
    Apply the ideal CPU affinity for this device to the calling thread.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    device_handle = self._handle
    nvml.device_set_cpu_affinity(device_handle)

569  

def clear_cpu_affinity(self) -> None:
    """
    Remove all affinity bindings for the calling thread.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    device_handle = self._handle
    nvml.device_clear_cpu_affinity(device_handle)

579  

580 ######################################################################### 

581 # CLOCK 

582 # See external class definitions in _clock.pxi 

583  

def get_clock(self, clock_type: ClockType | str) -> ClockInfo:
    """
    Build a :obj:`~_device.ClockInfo` object for `clock_type`, used to get
    information about and manage that specific clock on this device.
    """
    clock_info = ClockInfo(self._handle, clock_type)
    return clock_info

589  

@property
def is_auto_boosted_clocks_enabled(self) -> tuple[bool, bool]:
    """
    Current state of auto boosted clocks on a device.

    For Kepler™ or newer fully supported devices.

    Auto Boosted clocks are enabled by default on some hardware, allowing
    the GPU to run at higher clock rates to maximize performance as thermal
    limits allow.

    On Pascal™ and newer hardware, Auto Boosted clocks are controlled
    through application clocks. Use :meth:`set_application_clocks` and
    :meth:`reset_application_clocks` to control Auto Boost behavior.

    Returns
    -------
    bool
        The current state of Auto Boosted clocks
    bool
        The default Auto Boosted clocks behavior

    """
    states = nvml.device_get_auto_boosted_clocks_enabled(self._handle)
    current_state, default_state = states
    enabled = nvml.EnableState.FEATURE_ENABLED
    # Both NVML enable-state values are turned into plain booleans.
    return (current_state == enabled, default_state == enabled)

615  

@property
def current_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    Retrieves the current :obj:`~ClocksEventReasons`.

    For all fully supported products.

    Returns
    -------
    list[ClocksEventReasons]
        One entry per bit set in the bitmask reported by NVML.

    Raises
    ------
    ValueError
        If NVML reports a reason bit that has no known
        :obj:`~ClocksEventReasons` equivalent.
    """
    # `_unpack_bitmask` is given an array, so wrap the single 64-bit mask in
    # a one-element C array.
    cdef uint64_t[1] reasons
    reasons[0] = nvml.device_get_current_clocks_event_reasons(self._handle)
    output_reasons = []
    for reason in _unpack_bitmask(reasons):
        # `reason` is a bit position; the lookup table is keyed by flag value.
        try:
            output_reason = _CLOCKS_EVENT_REASONS_MAPPING[1 << reason]
        except KeyError:
            # The KeyError is an implementation detail; suppress it so callers
            # only see the ValueError (as the file's other translated errors do).
            raise ValueError(f"Unknown clock event reason bit: {1 << reason}") from None
        output_reasons.append(output_reason)
    return output_reasons

633  

@property
def supported_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    Retrieves supported :obj:`~ClocksEventReasons` that can be returned by
    :attr:`current_clock_event_reasons`.

    For all fully supported products.

    This property is not supported in virtual machines running virtual GPU (vGPU).

    Returns
    -------
    list[ClocksEventReasons]
        One entry per bit set in the bitmask reported by NVML.

    Raises
    ------
    ValueError
        If NVML reports a reason bit that has no known
        :obj:`~ClocksEventReasons` equivalent.
    """
    # `_unpack_bitmask` is given an array, so wrap the single 64-bit mask in
    # a one-element C array.
    cdef uint64_t[1] reasons
    reasons[0] = nvml.device_get_supported_clocks_event_reasons(self._handle)
    output_reasons = []
    for reason in _unpack_bitmask(reasons):
        # `reason` is a bit position; the lookup table is keyed by flag value.
        try:
            output_reason = _CLOCKS_EVENT_REASONS_MAPPING[1 << reason]
        except KeyError:
            # The KeyError is an implementation detail; suppress it so callers
            # only see the ValueError (as the file's other translated errors do).
            raise ValueError(f"Unknown clock event reason bit: {1 << reason}") from None
        output_reasons.append(output_reason)
    return output_reasons

654  

655 ########################################################################## 

656 # COOLER 

657 # See external class definitions in _cooler.pxi 

658  

@property
def cooler(self) -> CoolerInfo:
    """
    Cooler information for the device, wrapped in a
    :obj:`~_device.CoolerInfo` object.
    """
    raw_cooler_info = nvml.device_get_cooler_info(self._handle)
    return CoolerInfo(raw_cooler_info)

665  

666 ########################################################################## 

667 # DEVICE ATTRIBUTES 

668 # See external class definitions in _device_attributes.pxi 

669  

@property
def attributes(self) -> DeviceAttributes:
    """
    Various device attributes, wrapped in a
    :obj:`~_device.DeviceAttributes` object.

    For Ampere™ or newer fully supported devices. Only available on Linux
    systems.
    """
    raw_attributes = nvml.device_get_attributes_v2(self._handle)
    return DeviceAttributes(raw_attributes)

679  

680 ######################################################################### 

681 # DISPLAY 

682  

@property
def is_display_connected(self) -> bool:
    """
    Display mode for this device.

    ``True`` when a physical display (e.g. monitor) is currently connected
    to any of the device's connectors.
    """
    display_mode = nvml.device_get_display_mode(self._handle)
    return display_mode == nvml.EnableState.FEATURE_ENABLED

692  

@property
def is_display_active(self) -> bool:
    """
    Display active status for this device.

    ``True`` when a display is initialized on the device. For example,
    when X Server is attached to this device and has allocated memory for
    the screen.

    Display can be active even when no monitor is physically attached.
    """
    active_state = nvml.device_get_display_active(self._handle)
    return active_state == nvml.EnableState.FEATURE_ENABLED

705  

706 ########################################################################## 

707 # EVENTS 

708 # See external class definitions in _event.pxi 

709  

def register_events(self, events: EventType | str | list[EventType | str]) -> DeviceEvents:
    """
    Start recording events on this device.

    For Fermi™ or newer fully supported devices. For Linux only.

    ECC events are available only on ECC-enabled devices (see
    :meth:`Device.get_total_ecc_errors`). Power capping events are
    available only on Power Management enabled devices (see
    :meth:`Device.get_power_management_mode`).

    Recording starts with this call; events that occurred before it are
    not recorded. Wait for events using the :meth:`DeviceEvents.wait`
    method on the result.

    Examples
    --------
    >>> device = Device(index=0)
    >>> events = device.register_events([
    ...     EventType.XID_CRITICAL_ERROR,
    ... ])
    >>> while event := events.wait(timeout_ms=10000):
    ...     print(f"Event {event.event_type} occurred on device {event.device.uuid}")

    Parameters
    ----------
    events: EventType, str, or list of EventType or str
        The event type or list of event types to register for this device.

    Returns
    -------
    :obj:`~_device.DeviceEvents`
        An object representing the registered events. Call
        :meth:`~_device.DeviceEvents.wait` on this object to wait for events.

    Raises
    ------
    :class:`cuda.core.system.NotSupportedError`
        None of the requested event types are registered.
    """
    registered_events = DeviceEvents(self._handle, events)
    return registered_events

751  

def get_supported_event_types(self) -> list[EventType]:
    """
    Get the list of event types supported by this device.

    For Fermi™ or newer fully supported devices. For Linux only (returns an
    empty list on Windows).

    Returns
    -------
    list[EventType]
        The list of supported event types.

    Raises
    ------
    ValueError
        If NVML reports an event type bit that has no known
        :obj:`~EventType` equivalent.
    """
    # `_unpack_bitmask` is given an array, so wrap the single 64-bit mask in
    # a one-element C array.
    cdef uint64_t[1] bitmask
    bitmask[0] = nvml.device_get_supported_event_types(self._handle)
    events = []
    for ev in _unpack_bitmask(bitmask):
        # `ev` is a bit position; the lookup table is keyed by flag value.
        try:
            ev_enum = _EVENT_TYPE_MAPPING[1 << ev]
        except KeyError:
            # The KeyError is an implementation detail; suppress it so callers
            # only see the ValueError (as the file's other translated errors do).
            raise ValueError(f"Unknown event type bit: {1 << ev}") from None
        events.append(ev_enum)
    return events

774  

775 ########################################################################## 

776 # FAN 

777 # See external class definitions in _fan.pxi 

778  

def get_fan(self, fan: int = 0) -> FanInfo:
    """
    :obj:`~_device.FanInfo` object to get information and manage a specific fan on a device.

    Parameters
    ----------
    fan: int, optional
        Index of the fan. Must be in the range ``[0, num_fans)``. Default
        is 0.

    Raises
    ------
    ValueError
        If `fan` is outside the valid range for this device.
    """
    # `num_fans` queries NVML on every access; read it once and reuse it for
    # both the range check and the error message.
    fan_count = self.num_fans
    if fan < 0 or fan >= fan_count:
        raise ValueError(f"Fan index {fan} is out of range [0, {fan_count})")
    return FanInfo(self._handle, fan)

786  

@property
def num_fans(self) -> int:
    """
    Number of fans on the device, as reported by NVML.
    """
    fan_count = nvml.device_get_num_fans(self._handle)
    return fan_count

793  

794 ########################################################################## 

795 # FIELD VALUES 

796 # See external class definitions in _field_values.pxi 

797  

def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues:
    """
    Query several field values from the device in one call.

    Each value specified can raise its own exception. That exception will
    be raised when attempting to access the corresponding ``value`` from the
    returned :obj:`~_device.FieldValues` container.

    To confirm that there are no exceptions in the entire container, call
    :meth:`~_device.FieldValues.validate`.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        List of field IDs to query.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).

    Returns
    -------
    :obj:`~_device.FieldValues`
        Container of field values corresponding to the requested field IDs.
    """
    if len(field_ids) == 0:
        # NVML raises an InvalidArgumentError for a zero-length field_ids
        # array, so build an empty container without calling it.
        raw_values = nvml.FieldValue(0)
    else:
        raw_values = nvml.device_get_field_values(self._handle, field_ids)

    return FieldValues(raw_values)

828  

def clear_field_values(self, field_ids: list[int | tuple[int, int]]) -> None:
    """
    Clear several field values on the device in one call.

    An empty `field_ids` is a no-op.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        List of field IDs to clear.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).
    """
    # NVML raises an InvalidArgumentError for a zero-length field_ids array,
    # so only call it when there is something to clear.
    if len(field_ids) != 0:
        nvml.device_clear_field_values(self._handle, field_ids)

847  

848 ########################################################################## 

849 # INFOROM 

850 # See external class definitions in _inforom.pxi 

851  

@property
def inforom(self) -> InforomInfo:
    """
    InfoROM information for this device, as an
    :obj:`~_device.InforomInfo` object.

    For all products with an InfoROM.
    """
    inforom_accessor = InforomInfo(self)
    return inforom_accessor

860  

861 ########################################################################## 

862 # MEMORY 

863 # See external class definitions in _memory.pxi 

864  

865 @property 

866 def bar1_memory_info(self) -> BAR1MemoryInfo: 

867 """ 

868 :obj:`~_device.BAR1MemoryInfo` object with BAR1 memory information. 

869  

870 BAR1 is used to map the FB (device memory) so that it can be directly 

871 accessed by the CPU or by 3rd party devices (peer-to-peer on the PCIE 

872 bus). 

873 """ 

874 return BAR1MemoryInfo(nvml.device_get_bar1_memory_info(self._handle)) 1y

875  

@property
def memory_info(self) -> MemoryInfo:
    """
    Memory information for the device, wrapped in a
    :obj:`~_device.MemoryInfo` object.
    """
    raw_memory_info = nvml.device_get_memory_info_v2(self._handle)
    return MemoryInfo(raw_memory_info)

882  

883 ########################################################################## 

884 # NVLINK 

885 # See external class definitions in _nvlink.pxi 

886  

def get_nvlink(self, link: int) -> NvlinkInfo:
    """
    Get :obj:`~NvlinkInfo` for one NVLink of this device.

    For devices with NVLink support.

    Raises
    ------
    ValueError
        If `link` is negative or not below ``NvlinkInfo.max_links``.
    """
    out_of_range = link < 0 or link >= NvlinkInfo.max_links
    if out_of_range:
        raise ValueError(f"Link index {link} is out of range [0, {NvlinkInfo.max_links})")

    nvlink_info = NvlinkInfo(self, link)
    return nvlink_info

896  

897 ########################################################################## 

898 # PCI INFO 

899 # See external class definitions in _pci_info.pxi 

900  

@property
def pci_info(self) -> PciInfo:
    """
    PCI attributes of this device, wrapped in a :obj:`~_device.PciInfo`
    object.

    Non-physical devices, such as MIG devices, may not have PCI attributes.
    In that case, this property will raise a `RuntimeError`.
    """
    try:
        raw_pci_info = nvml.device_get_pci_info_ext(self._handle)
    except nvml.InvalidArgumentError:
        # NVML's invalid-argument error is reported to callers as a missing
        # PCI attribute set, without the original exception attached.
        raise RuntimeError("This device does not have PCI attributes") from None

    return PciInfo(raw_pci_info, self._handle)

915  

916 ########################################################################## 

917 # PERFORMANCE 

918 # See external class definitions in _performance.pxi 

919  

920 @property 

921 def performance_state(self) -> int | None: 

922 """ 

923 The current performance state of the device. 

924  

925 For Fermi™ or newer fully supported devices. 

926  

927 Returns 

928 ------- 

929 int | None 

930 The current performance state of the device, as an integer between 0 and 15, 

931 where 0 is maximum performance and higher numbers are lower performance. 

932 Returns `None` if the performance state is unknown. 

933 """ 

934 return _pstate_to_int(nvml.device_get_performance_state(self._handle)) 1ec

935  

936 @property 

937 def dynamic_pstates_info(self) -> GpuDynamicPstatesInfo: 

938 """ 

939 :obj:`~_device.GpuDynamicPstatesInfo` object with performance monitor samples from the associated subdevice. 

940 """ 

941 return GpuDynamicPstatesInfo(nvml.device_get_dynamic_pstates_info(self._handle)) 1c

942  

943 @property 

944 def supported_pstates(self) -> list[int]: 

945 """ 

946 Get all supported Performance States (P-States) for the device. 

947  

948 The returned list contains a contiguous list of valid P-States supported by 

949 the device. 

950  

951 Return 

952 ------ 

953 list[int] 

954 A list of supported performance state of the device, as an integer 

955 between 0 and 15, where 0 is maximum performance and higher numbers 

956 are lower performance. 

957 """ 

958 # From nvml.h: 

959 # The returned array would contain a contiguous list of valid P-States 

960 # supported by the device. If the number of supported P-States is fewer 

961 # than the size of the array supplied missing elements would contain \a 

962 # NVML_PSTATE_UNKNOWN. 

963  

964 pstates = [] 1c

965 for pstate in nvml.device_get_supported_performance_states(self._handle): 1c

966 pstate_value = _pstate_to_int(pstate) 1c

967 if pstate_value is not None: 1c

968 pstates.append(pstate_value) 1c

969 return pstates 1c

970  

971 ########################################################################## 

972 # PROCESS 

973 # See external class definitions in _process.pxi 

974  

975 @property 

976 def compute_running_processes(self) -> list[ProcessInfo]: 

977 """ 

978 Get information about processes with a compute context on a device 

979  

980 For Fermi™ or newer fully supported devices. 

981  

982 This function returns information only about compute running processes 

983 (e.g. CUDA application which have active context). Any graphics 

984 applications (e.g. using OpenGL, DirectX) won't be listed by this 

985 function. 

986  

987 Keep in mind that information returned by this call is dynamic and the 

988 number of elements might change in time. 

989  

990 In MIG mode, if device handle is provided, the API returns aggregate 

991 information, only if the caller has appropriate privileges. Per-instance 

992 information can be queried by using specific MIG device handles. 

993 Querying per-instance information using MIG device handles is not 

994 supported if the device is in vGPU Host virtualization mode. 

995 """ 

996 return [ProcessInfo(self, proc) for proc in nvml.device_get_compute_running_processes_v3(self._handle)] 1qQ

997  

998 ########################################################################## 

999 # REPAIR STATUS 

1000 # See external class definitions in _repair_status.pxi 

1001  

1002 @property 

1003 def repair_status(self) -> RepairStatus: 

1004 """ 

1005 :obj:`~_device.RepairStatus` object with TPC/Channel repair status. 

1006  

1007 For Ampere™ or newer fully supported devices. 

1008 """ 

1009 return RepairStatus(self._handle) 1M

1010  

1011 ########################################################################## 

1012 # TEMPERATURE 

1013 # See external class definitions in _temperature.pxi 

1014  

1015 @property 

1016 def temperature(self) -> Temperature: 

1017 """ 

1018 :obj:`~_device.Temperature` object with temperature information for the device. 

1019 """ 

1020 return Temperature(self._handle) 1N

1021  

1022 ####################################################################### 

1023 # TOPOLOGY 

1024  

1025 def get_topology_nearest_gpus(self, level: GpuTopologyLevel | str) -> Iterable[Device]: 

1026 """ 

1027 Retrieve the GPUs that are nearest to this device at a specific interconnectivity level. 

1028  

1029 Supported on Linux only. 

1030  

1031 Parameters 

1032 ---------- 

1033 level: :class:`GpuTopologyLevel` 

1034 The topology level. 

1035  

1036 Returns 

1037 ------- 

1038 Iterable of :class:`Device` 

1039 The nearest devices at the given topology level. 

1040 """ 

1041 cdef Device device 

1042 try: 1o

1043 level = _GPU_TOPOLOGY_LEVEL_MAPPING[level] 1o

1044 except KeyError: 

1045 raise ValueError( 

1046 f"Invalid topology level: {level}. " 

1047 f"Must be one of {list(GpuTopologyLevel.__members__.values())}" 

1048 ) from None 

1049 for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): 1o

1050 device = Device.__new__(Device) 

1051 device._handle = handle 

1052 yield device 

1053  

1054 ####################################################################### 

1055 # UTILIZATION 

1056  

1057 @property 

1058 def utilization(self) -> Utilization: 

1059 """ 

1060 Retrieves the current :obj:`~Utilization` rates for the device's major 

1061 subsystems. 

1062  

1063 For Fermi™ or newer fully supported devices. 

1064  

1065 Note: During driver initialization when ECC is enabled one can see high 

1066 GPU and Memory Utilization readings. This is caused by ECC Memory 

1067 Scrubbing mechanism that is performed during driver initialization. 

1068  

1069 Note: On MIG-enabled GPUs, querying device utilization rates is not 

1070 currently supported. 

1071  

1072 Returns 

1073 ------- 

1074 Utilization 

1075 An object containing the current utilization rates for the device. 

1076 """ 

1077 return Utilization(nvml.device_get_utilization_rates(self._handle)) 1O

1078  

1079  

def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel:
    """
    Retrieve the common ancestor for two devices.

    For Linux only.

    Parameters
    ----------
    device1: :class:`Device`
        The first device.
    device2: :class:`Device`
        The second device.

    Returns
    -------
    :class:`GpuTopologyLevel`
        The common ancestor level of the two devices.
    """
    raw_level = nvml.device_get_topology_common_ancestor(device1._handle, device2._handle)
    # Translate the NVML value into the public GpuTopologyLevel.
    return _GPU_TOPOLOGY_LEVEL_INV_MAPPING[raw_level]

1104  

1105  

def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex | str) -> GpuP2PStatus:
    """
    Retrieve the P2P status between two devices.

    Parameters
    ----------
    device1: :class:`Device`
        The first device.
    device2: :class:`Device`
        The second device.
    index: :class:`GpuP2PCapsIndex` | str
        The P2P capability index being looked for between ``device1`` and ``device2``.

    Returns
    -------
    :class:`GpuP2PStatus`
        The P2P status between the two devices. Status values without an
        entry in the status mapping are reported as ``GpuP2PStatus.UNKNOWN``.

    Raises
    ------
    ValueError
        If ``index`` is not a recognized P2P capability index.
    """
    try:
        caps_index = _GPU_P2P_CAPS_INDEX_MAPPING[index]
    except KeyError:
        raise ValueError(
            f"Invalid P2P caps index: {index}. "
            f"Must be one of {list(GpuP2PCapsIndex.__members__.values())}"
        ) from None

    raw_status = nvml.device_get_p2p_status(device1._handle, device2._handle, caps_index)
    return _GPU_P2P_STATUS_MAPPING.get(raw_status, GpuP2PStatus.UNKNOWN)

1139  

1140  

# Names exported by ``from <module> import *``: the Device class, the two
# module-level device-pair helpers, and NvlinkInfo.
__all__ = [
    "Device",
    "get_p2p_status",
    "get_topology_common_ancestor",
    "NvlinkInfo",
]

1146]