Coverage for cuda / core / system / _device.pyx: 75.29%
255 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-22 01:37 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-22 01:37 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
5from libc.stdint cimport intptr_t, uint64_t
6from libc.math cimport ceil
8from multiprocessing import cpu_count
9from typing import Iterable
10import warnings
12from cuda.bindings import nvml
14from ._nvml_context cimport initialize
15from cuda.core.system.typing import (
16 AddressingMode,
17 AffinityScope,
18 DeviceArch,
19 ClockId,
20 ClocksEventReasons,
21 ClockType,
22 CoolerControl,
23 CoolerTarget,
24 DeviceArch,
25 EventType,
26 FanControlPolicy,
27 FieldId,
28 GpuP2PCapsIndex,
29 GpuP2PStatus,
30 GpuTopologyLevel,
31 InforomObject,
32 TemperatureThresholds,
33 ThermalController,
34 ThermalTarget,
35)
cdef object _pstate_to_int(object pstate):
    # Convert an NVML P-state enum value into a plain integer offset from
    # PSTATE_0, or None when NVML reports PSTATE_UNKNOWN.
    if pstate == nvml.Pstates.PSTATE_UNKNOWN:
        return None
    raw_value = int(pstate)
    # Internal sanity check on the value handed back by NVML.
    assert 0 <= raw_value <= 15, (
        f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive, or PSTATE_UNKNOWN."
    )
    return raw_value - int(nvml.Pstates.PSTATE_0)
cdef int _pstate_to_enum(int pstate):
    # Convert a user-facing P-state number (0..15) into the matching NVML
    # enum value by offsetting from PSTATE_0. Raises ValueError when the
    # number is outside that range.
    if not (0 <= pstate <= 15):
        raise ValueError(f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive.")
    base_value = int(nvml.Pstates.PSTATE_0)
    return pstate + base_value
53include "_clock.pxi"
54include "_cooler.pxi"
55include "_device_attributes.pxi"
56include "_device_utils.pxi"
57include "_event.pxi"
58include "_fan.pxi"
59include "_field_values.pxi"
60include "_inforom.pxi"
61include "_memory.pxi"
62include "_mig.pxi"
63include "_nvlink.pxi"
64include "_pci_info.pxi"
65include "_performance.pxi"
66include "_process.pxi"
67include "_repair_status.pxi"
68include "_temperature.pxi"
69include "_utilization.pxi"
# NVML addressing-mode enum -> public ``AddressingMode``.
# ``Device.addressing_mode`` looks values up with ``.get(..., None)``, so any
# NVML value not listed here is reported as ``None``.
_ADDRESSING_MODE_MAPPING = {
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_HMM: AddressingMode.HMM,
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_ATS: AddressingMode.ATS,
}


# Public ``AffinityScope`` -> NVML affinity scope.
# Used by ``Device.get_memory_affinity`` and ``Device.get_cpu_affinity``; a
# missing key is turned into a ``ValueError`` there.
_AFFINITY_SCOPE_MAPPING = {
    AffinityScope.NODE: nvml.AffinityScope.NODE,
    AffinityScope.SOCKET: nvml.AffinityScope.SOCKET,
}


# NVML brand enum -> human-readable brand string.
# ``Device.brand`` falls back to "Unknown" for values not listed here.
_BRAND_TYPE_MAPPING = {
    nvml.BrandType.BRAND_UNKNOWN: "Unknown",
    nvml.BrandType.BRAND_QUADRO: "Quadro",
    nvml.BrandType.BRAND_TESLA: "Tesla",
    nvml.BrandType.BRAND_NVS: "NVS",
    nvml.BrandType.BRAND_GRID: "GRID",
    nvml.BrandType.BRAND_GEFORCE: "GeForce",
    nvml.BrandType.BRAND_TITAN: "Titan",
    nvml.BrandType.BRAND_NVIDIA_VAPPS: "NVIDIA vApps",
    nvml.BrandType.BRAND_NVIDIA_VPC: "NVIDIA VPC",
    nvml.BrandType.BRAND_NVIDIA_VCS: "NVIDIA VCS",
    nvml.BrandType.BRAND_NVIDIA_VWS: "NVIDIA VWS",
    nvml.BrandType.BRAND_NVIDIA_CLOUD_GAMING: "NVIDIA Cloud Gaming",
    nvml.BrandType.BRAND_NVIDIA_VGAMING: "NVIDIA vGaming",
    nvml.BrandType.BRAND_QUADRO_RTX: "Quadro RTX",
    nvml.BrandType.BRAND_NVIDIA_RTX: "NVIDIA RTX",
    nvml.BrandType.BRAND_NVIDIA: "NVIDIA",
    nvml.BrandType.BRAND_GEFORCE_RTX: "GeForce RTX",
    nvml.BrandType.BRAND_TITAN_RTX: "Titan RTX",
}


# Public ``GpuP2PCapsIndex`` -> NVML P2P capability index.
# NOTE(review): the consumer is not visible in this part of the file.
_GPU_P2P_CAPS_INDEX_MAPPING = {
    GpuP2PCapsIndex.READ: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ,
    GpuP2PCapsIndex.WRITE: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE,
    GpuP2PCapsIndex.NVLINK: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_NVLINK,
    GpuP2PCapsIndex.ATOMICS: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_ATOMICS,
    GpuP2PCapsIndex.PCI: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PCI,
    GpuP2PCapsIndex.PROP: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PROP,
    GpuP2PCapsIndex.UNKNOWN: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_UNKNOWN,
}


# NVML P2P status -> public ``GpuP2PStatus``.
# NOTE(review): the consumer is not visible in this part of the file.
_GPU_P2P_STATUS_MAPPING = {
    nvml.GpuP2PStatus.P2P_STATUS_OK: GpuP2PStatus.OK,
    nvml.GpuP2PStatus.P2P_STATUS_CHIPSET_NOT_SUPPORTED: GpuP2PStatus.CHIPSET_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_GPU_NOT_SUPPORTED: GpuP2PStatus.GPU_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED: GpuP2PStatus.IOH_TOPOLOGY_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_DISABLED_BY_REGKEY: GpuP2PStatus.DISABLED_BY_REGKEY,
    nvml.GpuP2PStatus.P2P_STATUS_NOT_SUPPORTED: GpuP2PStatus.NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_UNKNOWN: GpuP2PStatus.UNKNOWN,
}


# Public ``GpuTopologyLevel`` -> NVML topology level.
# Used by ``Device.get_topology_nearest_gpus`` to translate its ``level``
# argument.
_GPU_TOPOLOGY_LEVEL_MAPPING = {
    GpuTopologyLevel.INTERNAL: nvml.GpuTopologyLevel.TOPOLOGY_INTERNAL,
    GpuTopologyLevel.SINGLE: nvml.GpuTopologyLevel.TOPOLOGY_SINGLE,
    GpuTopologyLevel.MULTIPLE: nvml.GpuTopologyLevel.TOPOLOGY_MULTIPLE,
    GpuTopologyLevel.HOSTBRIDGE: nvml.GpuTopologyLevel.TOPOLOGY_HOSTBRIDGE,
    GpuTopologyLevel.NODE: nvml.GpuTopologyLevel.TOPOLOGY_NODE,
    GpuTopologyLevel.SYSTEM: nvml.GpuTopologyLevel.TOPOLOGY_SYSTEM,
}


# Reverse lookup: NVML topology level -> public ``GpuTopologyLevel``.
_GPU_TOPOLOGY_LEVEL_INV_MAPPING = {v: k for k, v in _GPU_TOPOLOGY_LEVEL_MAPPING.items()}
142cdef class Device:
143 """
144 Representation of a device.
146 :class:`cuda.core.system.Device` provides access to various pieces of metadata
147 about devices and their topology, as provided by the NVIDIA Management
148 Library (NVML). To use CUDA with a device, use :class:`cuda.core.Device`.
150 Creating a device instance causes NVML to initialize the target GPU.
151 NVML may initialize additional GPUs if the target GPU is an SLI slave.
153 Parameters
154 ----------
155 index: int, optional
156 Integer representing the NVML device index to get a handle to. Valid
157 values are between ``0`` and ``cuda.core.system.get_num_devices() - 1``.
159 The order in which devices are enumerated has no guarantees of
160 consistency between reboots. For that reason, it is recommended that
161 devices are looked up by their PCI ids or UUID.
163 uuid: bytes or str, optional
164 UUID of a CUDA device to get a handle to.
166 pci_bus_id: bytes or str, optional
167 PCI bus ID of a CUDA device to get a handle to.
169 Raises
170 ------
171 ValueError
172 If anything other than a single `index`, `uuid` or `pci_bus_id` are specified.
173 """
175 # This is made public for testing purposes only
176 cdef public intptr_t _handle
def __init__(
    self,
    *,
    index: int | None = None,
    uuid: bytes | str | None = None,
    pci_bus_id: bytes | str | None = None,
):
    # Exactly one of the three keyword-only selectors must be supplied; it
    # decides which NVML lookup is used to obtain the device handle.
    selectors = (index, uuid, pci_bus_id)
    cdef int num_given = sum(selector is not None for selector in selectors)

    if num_given == 0:
        raise ValueError("Handle requires either a device `index`, `uuid`, or `pci_bus_id`.")
    if num_given > 1:
        raise ValueError("Handle requires only one of `index`, `uuid`, or `pci_bus_id`.")

    initialize()

    if index is not None:
        self._handle = nvml.device_get_handle_by_index_v2(index)
    elif uuid is not None:
        # NVML is given text, so ASCII bytes are decoded first.
        uuid_text = uuid.decode("ascii") if isinstance(uuid, bytes) else uuid
        self._handle = nvml.device_get_handle_by_uuid(uuid_text)
    else:
        # Only `pci_bus_id` can be set at this point.
        bus_id_text = pci_bus_id.decode("ascii") if isinstance(pci_bus_id, bytes) else pci_bus_id
        self._handle = nvml.device_get_handle_by_pci_bus_id_v2(bus_id_text)
206 #########################################################################
207 # BASIC PROPERTIES
@property
def index(self) -> int:
    """
    NVML index of this device.

    Valid indices run from ``0`` to :meth:`Device.get_device_count` minus
    one; for example, a count of 2 means indices 0 and 1 are valid.

    NVML does not guarantee a stable enumeration order across reboots, so
    looking devices up by PCI id or GPU UUID is recommended.

    Note: the NVML index may not correlate with indices used by other APIs,
    such as the CUDA device index.
    """
    device_index = nvml.device_get_index(self._handle)
    return device_index
@property
def uuid(self) -> str:
    """
    Globally unique, immutable UUID of this device, as a 5 part hexadecimal
    string that augments the immutable board serial identifier.

    The value is returned as NVML reports it, including its ``gpu-`` or
    ``mig-`` prefix. Use `uuid_without_prefix` when the prefix is not
    wanted (for example, to interact with CUDA).
    """
    full_uuid = nvml.device_get_uuid(self._handle)
    return full_uuid
@property
def uuid_without_prefix(self) -> str:
    """
    Globally unique, immutable UUID of this device, as a 5 part hexadecimal
    string that augments the immutable board serial identifier, with the
    NVML prefix removed.

    NVML reports UUIDs with a ``gpu-`` or ``mig-`` prefix; this property
    drops it so the result matches the UUIDs used in CUDA. Use `uuid` when
    the prefix is needed.
    """
    prefixed_uuid = nvml.device_get_uuid(self._handle)
    # Both prefixes are four characters long, so a fixed slice removes them.
    return prefixed_uuid[4:]
@property
def pci_bus_id(self) -> str:
    """
    PCI bus ID of this device, taken from :attr:`pci_info`.
    """
    info = self.pci_info
    return info.bus_id
@property
def numa_node_id(self) -> int:
    """
    NUMA node of this GPU device.

    Only applies to platforms where the GPUs are NUMA nodes.
    """
    node_id = nvml.device_get_numa_node_id(self._handle)
    return node_id
@property
def arch(self) -> DeviceArch:
    """
    :obj:`~DeviceArch` architecture of this device.

    For example, a Tesla V100 reports the ``VOLTA`` architecture and an
    RTX A6000 reports ``AMPERE``. A value that :obj:`~DeviceArch` does not
    recognise is reported as ``DeviceArch.UNKNOWN``.
    """
    raw_arch = nvml.device_get_architecture(self._handle)
    try:
        known_arch = DeviceArch(raw_arch)
    except ValueError:
        return DeviceArch.UNKNOWN
    return known_arch
@property
def name(self) -> str:
    """
    Product name of the device, e.g.: `"Tesla V100-SXM2-32GB"`
    """
    device_name = nvml.device_get_name(self._handle)
    return device_name
@property
def brand(self) -> str:
    """
    Brand of the device as a display string.

    Returns "Unknown" if the brand is unknown.
    """
    brand_code = nvml.device_get_brand(self._handle)
    return _BRAND_TYPE_MAPPING.get(brand_code, "Unknown")
@property
def serial(self) -> str:
    """
    Globally unique serial number of the board this device sits on.

    For all products with an InfoROM.
    """
    board_serial = nvml.device_get_serial(self._handle)
    return board_serial
@property
def module_id(self) -> int:
    """
    Unique identifier of the device module on the baseboard.

    Each GPU module on a given baseboard gets its own identifier. For
    non-baseboard products, this ID would always be 0.
    """
    module = nvml.device_get_module_id(self._handle)
    return module
@property
def minor_number(self) -> int:
    """
    Minor number of this device.

    For Linux only.

    The Linux device driver uses the minor number to identify the device
    node in ``/dev/nvidiaX``.
    """
    minor = nvml.device_get_minor_number(self._handle)
    return minor
@property
def is_c2c_enabled(self) -> bool:
    """
    Whether C2C (Chip-to-Chip) mode is enabled for this device.
    """
    c2c_info = nvml.device_get_c2c_mode_info_v(self._handle)
    return bool(c2c_info.is_c2c_enabled)
@property
def is_persistence_mode_enabled(self) -> bool:
    """
    Whether persistence mode is enabled for this device.

    For Linux only.

    Assigning a truthy or falsy value to this property enables or disables
    persistence mode.
    """
    current_mode = nvml.device_get_persistence_mode(self._handle)
    return current_mode == nvml.EnableState.FEATURE_ENABLED

@is_persistence_mode_enabled.setter
def is_persistence_mode_enabled(self, enabled: bool) -> None:
    if enabled:
        requested_mode = nvml.EnableState.FEATURE_ENABLED
    else:
        requested_mode = nvml.EnableState.FEATURE_DISABLED
    nvml.device_set_persistence_mode(self._handle, requested_mode)
@property
def cuda_compute_capability(self) -> tuple[int, int]:
    """
    CUDA compute capability of the device as a ``(major, minor)`` tuple,
    e.g.: `(7, 0)` for a Tesla V100.
    """
    capability = nvml.device_get_cuda_compute_capability(self._handle)
    return capability
def to_cuda_device(self) -> "cuda.core.Device":
    """
    Get the corresponding :class:`cuda.core.Device` (which is used for CUDA
    access) for this :class:`cuda.core.system.Device` (which is used for
    NVIDIA machine library (NVML) access).

    The devices are mapped to one another by their UUID.

    Returns
    -------
    cuda.core.Device
        The corresponding CUDA device.

    Raises
    ------
    RuntimeError
        No corresponding CUDA device is found for this NVML device.

        For example, on a MIG system, the physical GPU will not have an
        available CUDA device, since it can not be used directly, even
        though it can be enumerated from NVML.
    """
    # Imported here rather than at module level, as in the original code.
    from cuda.core import Device as CudaDevice

    # CUDA does not have an API to get a device by its UUID, so we just
    # search all the devices for one with a matching UUID.
    cuda_devices = CudaDevice.get_all_devices()

    # `uuid_without_prefix` performs an NVML query on every access, so it is
    # read once here instead of once per candidate device.
    target_uuid = self.uuid_without_prefix

    for cuda_device in cuda_devices:
        if cuda_device.uuid == target_uuid:
            return cuda_device

    raise RuntimeError("No corresponding CUDA device found for this NVML device.")
@classmethod
def get_device_count(cls) -> int:
    """
    Return how many devices NVML reports as available.

    Returns
    -------
    int
        The number of available devices.
    """
    initialize()

    device_count = nvml.device_get_count_v2()
    return device_count
@classmethod
def get_all_devices(cls) -> Iterable[Device]:
    """
    Lazily yield one device instance per NVML device index.

    Returns
    -------
    Iterator over :obj:`~Device`
        An iterator over available devices.
    """
    initialize()

    total = nvml.device_get_count_v2()
    for ordinal in range(total):
        yield cls(index=ordinal)
429 #########################################################################
430 # ADDRESSING MODE
@property
def addressing_mode(self) -> AddressingMode | None:
    """
    The :obj:`~AddressingMode` of the device, or ``None`` when NVML reports
    a mode that has no :obj:`~AddressingMode` equivalent.
    """
    nvml_mode = nvml.device_get_addressing_mode(self._handle).value
    return _ADDRESSING_MODE_MAPPING.get(nvml_mode, None)
439 #########################################################################
440 # MIG (MULTI-INSTANCE GPU) DEVICES
@property
def mig(self) -> MigInfo:
    """
    :obj:`~MigInfo` accessor for MIG (Multi-Instance GPU) information about
    this device.

    For Ampere™ or newer fully supported devices.
    """
    accessor = MigInfo(self)
    return accessor
451 #########################################################################
452 # AFFINITY
@classmethod
def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]:
    """
    Retrieve the set of GPUs that have a CPU affinity with the given CPU number.

    Supported on Linux only.

    Parameters
    ----------
    cpu_index: int
        The CPU index.

    Returns
    -------
    Iterator of :obj:`~Device`
        An iterator over available devices.
    """
    cdef Device device
    # Like the other class-level entry points (`get_device_count`,
    # `get_all_devices`), this can be the first NVML call in a process, so
    # make sure the library is initialized before querying it.
    initialize()
    for handle in nvml.system_get_topology_gpu_set(cpu_index):
        # NVML already supplies the handle, so skip `__init__` (which would
        # look one up) and attach the handle directly.
        device = Device.__new__(Device)
        device._handle = handle
        yield device
def get_memory_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    Retrieve the indices of the NUMA nodes or CPU sockets that have the
    ideal memory affinity for the device.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    If requested scope is not applicable to the target topology, the API
    will fall back to reporting the memory affinity for the immediate non-I/O
    ancestor of the device.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        The scope of the affinity query. Must be one of the values of
        :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        A list of indices of NUMA nodes or CPU sockets with the ideal memory
        affinity for the device.

    Raises
    ------
    ValueError
        If `scope` is not a known :class:`AffinityScope` value.
    """
    try:
        nvml_scope = _AFFINITY_SCOPE_MAPPING[scope]
    except KeyError:
        valid_scopes = list(AffinityScope.__members__.values())
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {valid_scopes}"
        ) from None

    # The mask is requested in 64-bit words, sized from the host CPU count.
    mask_words = <unsigned int>ceil(cpu_count() / 64)
    affinity_mask = nvml.device_get_memory_affinity(self._handle, mask_words, nvml_scope)
    return _unpack_bitmask(affinity_mask)
def get_cpu_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    Retrieve the ideal CPU affinity for the device within the given scope.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        The scope of the affinity query. Must be one of the values of
        :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        The indices of the bits set in the CPU affinity mask reported by
        NVML for the device.

    Raises
    ------
    ValueError
        If `scope` is not a known :class:`AffinityScope` value.
    """
    try:
        nvml_scope = _AFFINITY_SCOPE_MAPPING[scope]
    except KeyError:
        valid_scopes = list(AffinityScope.__members__.values())
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {valid_scopes}"
        ) from None

    # The mask is requested in 64-bit words, sized from the host CPU count.
    mask_words = <unsigned int>ceil(cpu_count() / 64)
    affinity_mask = nvml.device_get_cpu_affinity_within_scope(self._handle, mask_words, nvml_scope)
    return _unpack_bitmask(affinity_mask)
def set_cpu_affinity(self):
    """
    Bind the calling thread to the ideal CPU affinity for this device.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    handle = self._handle
    nvml.device_set_cpu_affinity(handle)
def clear_cpu_affinity(self):
    """
    Remove every affinity binding from the calling thread.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    handle = self._handle
    nvml.device_clear_cpu_affinity(handle)
577 #########################################################################
578 # CLOCK
579 # See external class definitions in _clock.pxi
def get_clock(self, clock_type: ClockType | str) -> ClockInfo:
    """
    Build a :obj:`~_device.ClockInfo` for inspecting and managing the clock
    of the given type on this device.
    """
    clock = ClockInfo(self._handle, clock_type)
    return clock
@property
def is_auto_boosted_clocks_enabled(self) -> tuple[bool, bool]:
    """
    Current and default state of auto boosted clocks on the device.

    For Kepler™ or newer fully supported devices.

    Auto Boosted clocks are enabled by default on some hardware, allowing
    the GPU to run at higher clock rates to maximize performance as thermal
    limits allow.

    On Pascal™ and newer hardware, Auto Boosted clocks are controlled
    through application clocks. Use :meth:`set_application_clocks` and
    :meth:`reset_application_clocks` to control Auto Boost behavior.

    Returns
    -------
    bool
        The current state of Auto Boosted clocks
    bool
        The default Auto Boosted clocks behavior
    """
    feature_on = nvml.EnableState.FEATURE_ENABLED
    current_state, default_state = nvml.device_get_auto_boosted_clocks_enabled(self._handle)
    is_current_on = current_state == feature_on
    is_default_on = default_state == feature_on
    return is_current_on, is_default_on
@property
def current_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    The :obj:`~ClocksEventReasons` currently active on the device.

    For all fully supported products.

    Raises
    ------
    ValueError
        If NVML reports a reason bit with no known mapping.
    """
    # The bitmask helper is handed a one-element array, so the single
    # 64-bit mask is wrapped in one.
    cdef uint64_t[1] mask
    mask[0] = nvml.device_get_current_clocks_event_reasons(self._handle)
    decoded = []
    for bit in _unpack_bitmask(mask):
        try:
            decoded.append(_CLOCKS_EVENT_REASONS_MAPPING[1 << bit])
        except KeyError:
            raise ValueError(f"Unknown clock event reason bit: {1 << bit}")
    return decoded
@property
def supported_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    The :obj:`~ClocksEventReasons` that :attr:`current_clock_event_reasons`
    can report for this device.

    For all fully supported products.

    This is not supported in virtual machines running virtual GPU (vGPU).

    Raises
    ------
    ValueError
        If NVML reports a reason bit with no known mapping.
    """
    # The bitmask helper is handed a one-element array, so the single
    # 64-bit mask is wrapped in one.
    cdef uint64_t[1] mask
    mask[0] = nvml.device_get_supported_clocks_event_reasons(self._handle)
    decoded = []
    for bit in _unpack_bitmask(mask):
        try:
            decoded.append(_CLOCKS_EVENT_REASONS_MAPPING[1 << bit])
        except KeyError:
            raise ValueError(f"Unknown clock event reason bit: {1 << bit}")
    return decoded
652 ##########################################################################
653 # COOLER
654 # See external class definitions in _cooler.pxi
@property
def cooler(self) -> CoolerInfo:
    """
    :obj:`~_device.CoolerInfo` wrapping the cooler information NVML reports
    for the device.
    """
    raw_cooler_info = nvml.device_get_cooler_info(self._handle)
    return CoolerInfo(raw_cooler_info)
663 ##########################################################################
664 # DEVICE ATTRIBUTES
665 # See external class definitions in _device_attributes.pxi
@property
def attributes(self) -> DeviceAttributes:
    """
    :obj:`~_device.DeviceAttributes` wrapping the attributes NVML reports
    for the device.

    For Ampere™ or newer fully supported devices. Only available on Linux
    systems.
    """
    raw_attributes = nvml.device_get_attributes_v2(self._handle)
    return DeviceAttributes(raw_attributes)
677 #########################################################################
678 # DISPLAY
@property
def is_display_connected(self) -> bool:
    """
    Whether a physical display (e.g. monitor) is currently connected to any
    of the device's connectors.
    """
    display_mode = nvml.device_get_display_mode(self._handle)
    return display_mode == nvml.EnableState.FEATURE_ENABLED
@property
def is_display_active(self) -> bool:
    """
    Whether a display is initialized on the device.

    For example, whether X Server is attached to this device and has
    allocated memory for the screen.

    Display can be active even when no monitor is physically attached.
    """
    active_state = nvml.device_get_display_active(self._handle)
    return active_state == nvml.EnableState.FEATURE_ENABLED
703 ##########################################################################
704 # EVENTS
705 # See external class definitions in _event.pxi
def register_events(self, events: EventType | str | list[EventType | str]) -> DeviceEvents:
    """
    Begin recording the given event types on this device.

    For Fermi™ or newer fully supported devices. For Linux only.

    ECC events are available only on ECC-enabled devices (see
    :meth:`Device.get_total_ecc_errors`). Power capping events are
    available only on Power Management enabled devices (see
    :meth:`Device.get_power_management_mode`).

    Recording starts with this call; events that occurred earlier are not
    recorded. Wait for events using the :meth:`DeviceEvents.wait` method on
    the result.

    Examples
    --------
    >>> device = Device(index=0)
    >>> events = device.register_events([
    ...     EventType.XID_CRITICAL_ERROR,
    ... ])
    >>> while event := events.wait(timeout_ms=10000):
    ...     print(f"Event {event.event_type} occurred on device {event.device.uuid}")

    Parameters
    ----------
    events: EventType, str, or list of EventType or str
        The event type or list of event types to register for this device.

    Returns
    -------
    :obj:`~_device.DeviceEvents`
        An object representing the registered events. Call
        :meth:`~_device.DeviceEvents.wait` on this object to wait for events.

    Raises
    ------
    :class:`cuda.core.system.NotSupportedError`
        None of the requested event types are registered.
    """
    registration = DeviceEvents(self._handle, events)
    return registration
def get_supported_event_types(self) -> list[EventType]:
    """
    List the event types this device supports.

    For Fermi™ or newer fully supported devices. For Linux only (returns an
    empty list on Windows).

    Returns
    -------
    list[EventType]
        The list of supported event types.

    Raises
    ------
    ValueError
        If NVML reports an event type bit with no known mapping.
    """
    # The bitmask helper is handed a one-element array, so the single
    # 64-bit mask is wrapped in one.
    cdef uint64_t[1] mask
    mask[0] = nvml.device_get_supported_event_types(self._handle)
    supported = []
    for bit in _unpack_bitmask(mask):
        try:
            supported.append(_EVENT_TYPE_MAPPING[1 << bit])
        except KeyError:
            raise ValueError(f"Unknown event type bit: {1 << bit}")
    return supported
772 ##########################################################################
773 # FAN
774 # See external class definitions in _fan.pxi
def get_fan(self, fan: int = 0) -> FanInfo:
    """
    Build a :obj:`~_device.FanInfo` for inspecting and managing one fan on
    the device.

    Raises ``ValueError`` when `fan` is negative or not below
    :attr:`num_fans`.
    """
    if not (0 <= fan < self.num_fans):
        raise ValueError(f"Fan index {fan} is out of range [0, {self.num_fans})")
    fan_info = FanInfo(self._handle, fan)
    return fan_info
@property
def num_fans(self) -> int:
    """
    How many fans the device has.
    """
    fan_count = nvml.device_get_num_fans(self._handle)
    return fan_count
791 ##########################################################################
792 # FIELD VALUES
793 # See external class definitions in _field_values.pxi
def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues:
    """
    Query several field values from the device in one call.

    Each requested value can carry its own exception, which is raised when
    the corresponding ``value`` is accessed on the returned
    :obj:`~_device.FieldValues` container.

    To confirm that there are no exceptions in the entire container, call
    :meth:`~_device.FieldValues.validate`.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        List of field IDs to query.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).

    Returns
    -------
    :obj:`~_device.FieldValues`
        Container of field values corresponding to the requested field IDs.
    """
    if len(field_ids) > 0:
        raw_values = nvml.device_get_field_values(self._handle, field_ids)
    else:
        # NVML raises an InvalidArgumentError for an empty request, so hand
        # back an empty container without calling it.
        raw_values = nvml.FieldValue(0)
    return FieldValues(raw_values)
def clear_field_values(self, field_ids: list[int | tuple[int, int]]) -> None:
    """
    Clear several field values on the device in one call.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        List of field IDs to clear.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).
    """
    # NVML raises an InvalidArgumentError for an empty request, so only
    # call it when there is something to clear.
    if len(field_ids) != 0:
        nvml.device_clear_field_values(self._handle, field_ids)
845 ##########################################################################
846 # INFOROM
847 # See external class definitions in _inforom.pxi
@property
def inforom(self) -> InforomInfo:
    """
    :obj:`~_device.InforomInfo` accessor for this device's InfoROM
    information.

    For all products with an InfoROM.
    """
    accessor = InforomInfo(self)
    return accessor
858 ##########################################################################
859 # MEMORY
860 # See external class definitions in _memory.pxi
@property
def bar1_memory_info(self) -> BAR1MemoryInfo:
    """
    :obj:`~_device.BAR1MemoryInfo` wrapping the BAR1 memory information
    NVML reports for the device.

    BAR1 is used to map the FB (device memory) so that it can be directly
    accessed by the CPU or by 3rd party devices (peer-to-peer on the PCIE
    bus).
    """
    raw_bar1_info = nvml.device_get_bar1_memory_info(self._handle)
    return BAR1MemoryInfo(raw_bar1_info)
@property
def memory_info(self) -> MemoryInfo:
    """
    :obj:`~_device.MemoryInfo` wrapping the memory information NVML reports
    for the device.
    """
    raw_memory_info = nvml.device_get_memory_info_v2(self._handle)
    return MemoryInfo(raw_memory_info)
880 ##########################################################################
881 # NVLINK
882 # See external class definitions in _nvlink.pxi
def get_nvlink(self, link: int) -> NvlinkInfo:
    """
    Build a :obj:`~NvlinkInfo` for the given NVLink index on this device.

    For devices with NVLink support.

    Raises ``ValueError`` when `link` is negative or not below
    ``NvlinkInfo.max_links``.
    """
    if not (0 <= link < NvlinkInfo.max_links):
        raise ValueError(f"Link index {link} is out of range [0, {NvlinkInfo.max_links})")
    link_info = NvlinkInfo(self, link)
    return link_info
894 ##########################################################################
895 # PCI INFO
896 # See external class definitions in _pci_info.pxi
@property
def pci_info(self) -> PciInfo:
    """
    :obj:`~_device.PciInfo` describing the PCI attributes of this device.

    Non-physical devices, such as MIG devices, may not have PCI attributes.
    In that case, this property will raise a `RuntimeError`.
    """
    try:
        raw_pci_info = nvml.device_get_pci_info_ext(self._handle)
    except nvml.InvalidArgumentError:
        # Re-raised as RuntimeError; `from None` hides the NVML error.
        raise RuntimeError("This device does not have PCI attributes") from None
    return PciInfo(raw_pci_info, self._handle)
913 ##########################################################################
914 # PERFORMANCE
915 # See external class definitions in _performance.pxi
@property
def performance_state(self) -> int | None:
    """
    Current performance state (P-State) of the device.

    For Fermi™ or newer fully supported devices.

    Returns
    -------
    int | None
        The current performance state of the device, as an integer between 0 and 15,
        where 0 is maximum performance and higher numbers are lower performance.
        Returns `None` if the performance state is unknown.
    """
    raw_pstate = nvml.device_get_performance_state(self._handle)
    return _pstate_to_int(raw_pstate)
@property
def dynamic_pstates_info(self) -> GpuDynamicPstatesInfo:
    """
    :obj:`~_device.GpuDynamicPstatesInfo` wrapping the performance monitor
    samples from the associated subdevice.
    """
    raw_samples = nvml.device_get_dynamic_pstates_info(self._handle)
    return GpuDynamicPstatesInfo(raw_samples)
@property
def supported_pstates(self) -> list[int]:
    """
    All Performance States (P-States) the device supports.

    The returned list contains a contiguous list of valid P-States supported by
    the device.

    Returns
    -------
    list[int]
        A list of supported performance state of the device, as an integer
        between 0 and 15, where 0 is maximum performance and higher numbers
        are lower performance.
    """
    # Per nvml.h, the array NVML fills in holds a contiguous run of valid
    # P-States; when fewer are supported than the array has room for, the
    # remaining slots hold NVML_PSTATE_UNKNOWN. Those convert to None and
    # are dropped here.
    raw_states = nvml.device_get_supported_performance_states(self._handle)
    converted = [_pstate_to_int(raw_state) for raw_state in raw_states]
    return [value for value in converted if value is not None]
968 ##########################################################################
969 # PROCESS
970 # See external class definitions in _process.pxi
@property
def compute_running_processes(self) -> list[ProcessInfo]:
    """
    Information about the processes that hold a compute context on the device.

    For Fermi™ or newer fully supported devices.

    Only compute processes are reported (e.g. CUDA applications which have
    an active context). Graphics applications (e.g. using OpenGL, DirectX)
    are not listed.

    The information is dynamic, so the number of elements might change over
    time.

    In MIG mode, if device handle is provided, the API returns aggregate
    information, only if the caller has appropriate privileges. Per-instance
    information can be queried by using specific MIG device handles.
    Querying per-instance information using MIG device handles is not
    supported if the device is in vGPU Host virtualization mode.
    """
    processes = []
    for raw_process in nvml.device_get_compute_running_processes_v3(self._handle):
        processes.append(ProcessInfo(self, raw_process))
    return processes
995 ##########################################################################
996 # REPAIR STATUS
997 # See external class definitions in _repair_status.pxi
@property
def repair_status(self) -> RepairStatus:
    """
    TPC/Channel repair status, exposed as a :obj:`~_device.RepairStatus` object.

    For Ampere™ or newer fully supported devices.
    """
    # The wrapper is built directly from this device's handle.
    status = RepairStatus(self._handle)
    return status
1008 ##########################################################################
1009 # TEMPERATURE
1010 # See external class definitions in _temperature.pxi
@property
def temperature(self) -> Temperature:
    """
    Temperature information for the device, exposed as a
    :obj:`~_device.Temperature` object.
    """
    # The wrapper is built directly from this device's handle.
    readings = Temperature(self._handle)
    return readings
1019 #######################################################################
1020 # TOPOLOGY
def get_topology_nearest_gpus(self, level: GpuTopologyLevel | str) -> Iterable[Device]:
    """
    Retrieve the GPUs that are nearest to this device at a specific interconnectivity level.

    Supported on Linux only.

    Parameters
    ----------
    level: :class:`GpuTopologyLevel` | str
        The topology level.

    Returns
    -------
    Iterable of :class:`Device`
        The nearest devices at the given topology level. The devices are
        produced lazily: NVML is queried when iteration starts.

    Raises
    ------
    ValueError
        If ``level`` is not a recognised topology level. This is raised
        immediately when the method is called, not on first iteration.
    """
    # Validate here rather than inside the generator: a generator body does
    # not run until it is first iterated, which would postpone the error to
    # an unrelated point in the caller's code.
    try:
        nvml_level = _GPU_TOPOLOGY_LEVEL_MAPPING[level]
    except KeyError:
        raise ValueError(
            f"Invalid topology level: {level}. "
            f"Must be one of {list(GpuTopologyLevel.__members__.values())}"
        ) from None
    return self._iter_topology_nearest_gpus(nvml_level)

def _iter_topology_nearest_gpus(self, nvml_level):
    """
    Lazily yield a :class:`Device` for each handle NVML reports as nearest
    to this device at the already-validated ``nvml_level``.
    """
    cdef Device device
    for handle in nvml.device_get_topology_nearest_gpus(self._handle, nvml_level):
        # Bypass __init__ and attach the handle NVML returned directly.
        device = Device.__new__(Device)
        device._handle = handle
        yield device
1051 #######################################################################
1052 # UTILIZATION
@property
def utilization(self) -> Utilization:
    """
    Current :obj:`~Utilization` rates for the device's major subsystems.

    For Fermi™ or newer fully supported devices.

    Note: During driver initialization when ECC is enabled one can see high
    GPU and Memory Utilization readings. This is caused by the ECC Memory
    Scrubbing mechanism that is performed during driver initialization.

    Note: On MIG-enabled GPUs, querying device utilization rates is not
    currently supported.

    Returns
    -------
    Utilization
        An object containing the current utilization rates for the device.
    """
    # Query NVML first, then hand the raw result to the wrapper class.
    raw_rates = nvml.device_get_utilization_rates(self._handle)
    return Utilization(raw_rates)
def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel:
    """
    Retrieve the common ancestor for two devices.

    For Linux only.

    Parameters
    ----------
    device1: :class:`Device`
        The first device.
    device2: :class:`Device`
        The second device.

    Returns
    -------
    :class:`GpuTopologyLevel`
        The common ancestor level of the two devices.
    """
    raw_level = nvml.device_get_topology_common_ancestor(device1._handle, device2._handle)
    # Translate the NVML value through the inverse topology-level table.
    return _GPU_TOPOLOGY_LEVEL_INV_MAPPING[raw_level]
def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex | str) -> GpuP2PStatus:
    """
    Retrieve the P2P status between two devices.

    Parameters
    ----------
    device1: :class:`Device`
        The first device.
    device2: :class:`Device`
        The second device.
    index: :class:`GpuP2PCapsIndex` | str
        The P2P capability index being looked for between ``device1`` and ``device2``.

    Returns
    -------
    :class:`GpuP2PStatus`
        The P2P status between the two devices. A status value that has no
        entry in the status table is reported as ``GpuP2PStatus.UNKNOWN``.

    Raises
    ------
    ValueError
        If ``index`` is not a recognised P2P capability index.
    """
    try:
        caps_index = _GPU_P2P_CAPS_INDEX_MAPPING[index]
    except KeyError:
        raise ValueError(
            f"Invalid P2P caps index: {index}. "
            f"Must be one of {list(GpuP2PCapsIndex.__members__.values())}"
        ) from None

    raw_status = nvml.device_get_p2p_status(device1._handle, device2._handle, caps_index)
    # Fall back to UNKNOWN rather than raising for unmapped status values.
    return _GPU_P2P_STATUS_MAPPING.get(raw_status, GpuP2PStatus.UNKNOWN)
# Public names exported by ``from <module> import *``.
# NOTE(review): "NvlinkInfo" is not defined in this part of the file;
# presumably it comes from the included _nvlink.pxi - confirm.
__all__ = ["Device", "get_p2p_status", "get_topology_common_ancestor", "NvlinkInfo"]