Coverage for cuda/core/system/

3# SPDX-License-Identifier: Apache-2.0

5from libc.stdint cimport intptr_t, uint64_t

6from libc.math cimport ceil

8from multiprocessing import cpu_count (empty)

9from typing import Iterable (empty)

10import warnings (empty)

12from cuda.bindings import nvml (empty)

14from ._nvml_context cimport initialize

15from cuda.core.system.typing import ( (empty)

16 AddressingMode,

17 AffinityScope,

18 DeviceArch,

19 ClockId,

20 ClocksEventReasons,

21 ClockType,

22 CoolerControl,

23 CoolerTarget,

24 DeviceArch,

25 EventType,

26 FanControlPolicy,

27 FieldId,

28 GpuP2PCapsIndex,

29 GpuP2PStatus,

30 GpuTopologyLevel,

31 InforomObject,

32 TemperatureThresholds,

33 ThermalController,

34 ThermalTarget,

35)

cdef object _pstate_to_int(object pstate):
    """
    Convert an NVML P-state enum value into a plain integer index.

    Parameters
    ----------
    pstate:
        A value comparable with the members of ``nvml.Pstates``.

    Returns
    -------
    int or None
        The offset of ``pstate`` from ``nvml.Pstates.PSTATE_0``, or ``None``
        when ``pstate`` equals ``nvml.Pstates.PSTATE_UNKNOWN``.

    Raises
    ------
    ValueError
        If the numeric value of ``pstate`` lies outside ``[0, 15]``.
    """
    if pstate == nvml.Pstates.PSTATE_UNKNOWN:
        return None

    value = int(pstate)
    # Raise explicitly rather than ``assert``: assertions can be compiled out,
    # which would let an out-of-range value through unchecked. ValueError also
    # matches what ``_pstate_to_enum`` raises for the same condition.
    if value < 0 or value > 15:
        raise ValueError(
            f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive, or PSTATE_UNKNOWN."
        )
    return value - int(nvml.Pstates.PSTATE_0)

cdef int _pstate_to_enum(int pstate):
    """
    Convert an integer P-state index into the numeric NVML enum value.

    The result is ``pstate`` offset by ``nvml.Pstates.PSTATE_0``.

    Raises
    ------
    ValueError
        If ``pstate`` is not within ``[0, 15]``.
    """
    if 0 <= pstate <= 15:
        return int(pstate) + int(nvml.Pstates.PSTATE_0)
    raise ValueError(f"Invalid P-state: {pstate}. Must be between 0 and 15 inclusive.")

53include "_clock.pxi"

54include "_cooler.pxi"

55include "_device_attributes.pxi"

56include "_device_utils.pxi"

57include "_event.pxi"

58include "_fan.pxi"

59include "_field_values.pxi"

60include "_inforom.pxi"

61include "_memory.pxi"

62include "_mig.pxi"

63include "_nvlink.pxi"

64include "_pci_info.pxi"

65include "_performance.pxi"

66include "_process.pxi"

67include "_repair_status.pxi"

68include "_temperature.pxi"

69include "_utilization.pxi"

# NVML addressing-mode enum -> public ``AddressingMode``.
_ADDRESSING_MODE_MAPPING = {
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_HMM: AddressingMode.HMM,
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_ATS: AddressingMode.ATS,
}


# Public ``AffinityScope`` -> NVML affinity-scope enum.
_AFFINITY_SCOPE_MAPPING = {
    AffinityScope.NODE: nvml.AffinityScope.NODE,
    AffinityScope.SOCKET: nvml.AffinityScope.SOCKET,
}


# NVML brand enum -> human-readable brand name.
_BRAND_TYPE_MAPPING = {
    nvml.BrandType.BRAND_UNKNOWN: "Unknown",
    nvml.BrandType.BRAND_QUADRO: "Quadro",
    nvml.BrandType.BRAND_TESLA: "Tesla",
    nvml.BrandType.BRAND_NVS: "NVS",
    nvml.BrandType.BRAND_GRID: "GRID",
    nvml.BrandType.BRAND_GEFORCE: "GeForce",
    nvml.BrandType.BRAND_TITAN: "Titan",
    nvml.BrandType.BRAND_NVIDIA_VAPPS: "NVIDIA vApps",
    nvml.BrandType.BRAND_NVIDIA_VPC: "NVIDIA VPC",
    nvml.BrandType.BRAND_NVIDIA_VCS: "NVIDIA VCS",
    nvml.BrandType.BRAND_NVIDIA_VWS: "NVIDIA VWS",
    nvml.BrandType.BRAND_NVIDIA_CLOUD_GAMING: "NVIDIA Cloud Gaming",
    nvml.BrandType.BRAND_NVIDIA_VGAMING: "NVIDIA vGaming",
    nvml.BrandType.BRAND_QUADRO_RTX: "Quadro RTX",
    nvml.BrandType.BRAND_NVIDIA_RTX: "NVIDIA RTX",
    nvml.BrandType.BRAND_NVIDIA: "NVIDIA",
    nvml.BrandType.BRAND_GEFORCE_RTX: "GeForce RTX",
    nvml.BrandType.BRAND_TITAN_RTX: "Titan RTX",
}


# Public ``GpuP2PCapsIndex`` -> NVML P2P capability index enum.
_GPU_P2P_CAPS_INDEX_MAPPING = {
    GpuP2PCapsIndex.READ: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ,
    GpuP2PCapsIndex.WRITE: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE,
    GpuP2PCapsIndex.NVLINK: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_NVLINK,
    GpuP2PCapsIndex.ATOMICS: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_ATOMICS,
    GpuP2PCapsIndex.PCI: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PCI,
    GpuP2PCapsIndex.PROP: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PROP,
    GpuP2PCapsIndex.UNKNOWN: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_UNKNOWN,
}


# NVML P2P status enum -> public ``GpuP2PStatus``.
_GPU_P2P_STATUS_MAPPING = {
    nvml.GpuP2PStatus.P2P_STATUS_OK: GpuP2PStatus.OK,
    nvml.GpuP2PStatus.P2P_STATUS_CHIPSET_NOT_SUPPORTED: GpuP2PStatus.CHIPSET_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_GPU_NOT_SUPPORTED: GpuP2PStatus.GPU_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED: GpuP2PStatus.IOH_TOPOLOGY_NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_DISABLED_BY_REGKEY: GpuP2PStatus.DISABLED_BY_REGKEY,
    nvml.GpuP2PStatus.P2P_STATUS_NOT_SUPPORTED: GpuP2PStatus.NOT_SUPPORTED,
    nvml.GpuP2PStatus.P2P_STATUS_UNKNOWN: GpuP2PStatus.UNKNOWN,
}


# Public ``GpuTopologyLevel`` -> NVML topology-level enum.
_GPU_TOPOLOGY_LEVEL_MAPPING = {
    GpuTopologyLevel.INTERNAL: nvml.GpuTopologyLevel.TOPOLOGY_INTERNAL,
    GpuTopologyLevel.SINGLE: nvml.GpuTopologyLevel.TOPOLOGY_SINGLE,
    GpuTopologyLevel.MULTIPLE: nvml.GpuTopologyLevel.TOPOLOGY_MULTIPLE,
    GpuTopologyLevel.HOSTBRIDGE: nvml.GpuTopologyLevel.TOPOLOGY_HOSTBRIDGE,
    GpuTopologyLevel.NODE: nvml.GpuTopologyLevel.TOPOLOGY_NODE,
    GpuTopologyLevel.SYSTEM: nvml.GpuTopologyLevel.TOPOLOGY_SYSTEM,
}


# Reverse lookup of the table above: NVML topology-level enum -> public
# ``GpuTopologyLevel``.
_GPU_TOPOLOGY_LEVEL_INV_MAPPING = {v: k for k, v in _GPU_TOPOLOGY_LEVEL_MAPPING.items()}

139

140

141

142cdef class Device:

143 """

144 Representation of a device.

145

146 :class:`cuda.core.system.Device` provides access to various pieces of metadata

147 about devices and their topology, as provided by the NVIDIA Management

148 Library (NVML). To use CUDA with a device, use :class:`cuda.core.Device`.

149

150 Creating a device instance causes NVML to initialize the target GPU.

151 NVML may initialize additional GPUs if the target GPU is an SLI slave.

152

153 Parameters

154 ----------

155 index: int, optional

156 Integer representing the CUDA device index to get a handle to. Valid

157 values are between ``0`` and ``cuda.core.system.get_num_devices() - 1``.

158

159 The order in which devices are enumerated has no guarantees of

160 consistency between reboots. For that reason, it is recommended that

161 devices are looked up by their PCI ids or UUID.

162

163 uuid: bytes or str, optional

164 UUID of a CUDA device to get a handle to.

165

166 pci_bus_id: bytes or str, optional

167 PCI bus ID of a CUDA device to get a handle to.

168

169 Raises

170 ------

171 ValueError

172 If anything other than a single `index`, `uuid` or `pci_bus_id` are specified.

173 """

174

175 # This is made public for testing purposes only

176 cdef public intptr_t _handle

177

def __init__(
    self,
    *,
    index: int | None = None,
    uuid: bytes | str | None = None,
    pci_bus_id: bytes | str | None = None,
):
    """
    Resolve exactly one identifier into an NVML device handle.

    Raises
    ------
    ValueError
        If none, or more than one, of ``index``, ``uuid`` and ``pci_bus_id``
        is supplied.
    """
    cdef int given = 0
    for candidate in (index, uuid, pci_bus_id):
        if candidate is not None:
            given += 1

    if given > 1:
        raise ValueError("Handle requires only one of `index`, `uuid`, or `pci_bus_id`.")
    if given == 0:
        raise ValueError("Handle requires either a device `index`, `uuid`, or `pci_bus_id`.")

    # Arguments are validated before NVML is touched.
    initialize()

    if index is not None:
        self._handle = nvml.device_get_handle_by_index_v2(index)
        return

    if uuid is not None:
        # Byte strings are decoded as ASCII before being handed to NVML.
        uuid_text = uuid.decode("ascii") if isinstance(uuid, bytes) else uuid
        self._handle = nvml.device_get_handle_by_uuid(uuid_text)
        return

    # Exactly one argument was given and it was neither of the above.
    bus_id_text = pci_bus_id.decode("ascii") if isinstance(pci_bus_id, bytes) else pci_bus_id
    self._handle = nvml.device_get_handle_by_pci_bus_id_v2(bus_id_text)

205

206 #########################################################################

207 # BASIC PROPERTIES

208

@property
def index(self) -> int:
    """
    NVML index of this device.

    Valid indices run from ``0`` to ``Device.get_device_count() - 1``; with
    two devices, the valid indices are 0 and 1.

    NVML's enumeration order is not guaranteed to be stable across reboots,
    so looking devices up by PCI id or GPU UUID is recommended instead.

    Note: the NVML index may not correlate with indices used by other APIs,
    such as the CUDA device index.
    """
    nvml_index = nvml.device_get_index(self._handle)
    return nvml_index

227

@property
def uuid(self) -> str:
    """
    Globally unique, immutable UUID of this device.

    The value is a 5 part hexadecimal string that augments the immutable
    board serial identifier.

    As in the upstream NVML C++ API, the string carries a ``gpu-`` or
    ``mig-`` prefix. Use the `uuid_without_prefix` property when the bare
    UUID is needed (for example, to interact with CUDA).
    """
    prefixed_uuid = nvml.device_get_uuid(self._handle)
    return prefixed_uuid

240

@property
def uuid_without_prefix(self) -> str:
    """
    Globally unique, immutable UUID of this device, without the NVML prefix.

    The value is a 5 part hexadecimal string that augments the immutable
    board serial identifier.

    The upstream NVML C++ API prepends ``gpu-`` or ``mig-``; this property
    drops that prefix so the result matches the UUIDs used in CUDA. Use the
    `uuid` property when the prefix is wanted.
    """
    prefixed_uuid = nvml.device_get_uuid(self._handle)
    # Both ``gpu-`` and ``mig-`` are four characters long.
    return prefixed_uuid[4:]

254

@property
def pci_bus_id(self) -> str:
    """
    PCI bus ID of this device, read from the ``pci_info`` property.
    """
    info = self.pci_info
    return info.bus_id

261

@property
def numa_node_id(self) -> int:
    """
    NUMA node of this GPU device.

    Only meaningful on platforms where the GPUs are NUMA nodes.
    """
    node_id = nvml.device_get_numa_node_id(self._handle)
    return node_id

270

@property
def arch(self) -> DeviceArch:
    """
    :obj:`~DeviceArch` architecture of the device.

    For example, a Tesla V100 reports a value whose ``name`` is ``"VOLTA"``
    and an RTX A6000 reports one whose ``name`` is ``"AMPERE"``.

    Values that :obj:`~DeviceArch` does not recognise are reported as
    ``DeviceArch.UNKNOWN``.
    """
    raw_arch = nvml.device_get_architecture(self._handle)
    try:
        known_arch = DeviceArch(raw_arch)
    except ValueError:
        # NVML returned a value the enum has no member for.
        known_arch = DeviceArch.UNKNOWN
    return known_arch

285

@property
def name(self) -> str:
    """
    Product name of the device, e.g. `"Tesla V100-SXM2-32GB"`.
    """
    device_name = nvml.device_get_name(self._handle)
    return device_name

292

@property
def brand(self) -> str:
    """
    Brand of the device as a display string.

    Falls back to "Unknown" when NVML reports a brand that has no entry in
    ``_BRAND_TYPE_MAPPING``.
    """
    brand_type = nvml.device_get_brand(self._handle)
    return _BRAND_TYPE_MAPPING.get(brand_type, "Unknown")

301

@property
def serial(self) -> str:
    """
    Globally unique serial number of the board this device sits on.

    For all products with an InfoROM.
    """
    board_serial = nvml.device_get_serial(self._handle)
    return board_serial

311

@property
def module_id(self) -> int:
    """
    Unique identifier of this device's module on the baseboard.

    Each GPU module on a given baseboard has its own identifier. For
    non-baseboard products the ID is always 0.
    """
    module = nvml.device_get_module_id(self._handle)
    return module

322

@property
def minor_number(self) -> int:
    """
    Minor number of this device.

    For Linux only.

    The Linux device driver uses the minor number to identify the device
    node in ``/dev/nvidiaX``.
    """
    minor = nvml.device_get_minor_number(self._handle)
    return minor

334

@property
def is_c2c_enabled(self) -> bool:
    """
    ``True`` when C2C (Chip-to-Chip) mode is enabled for this device.
    """
    c2c_info = nvml.device_get_c2c_mode_info_v(self._handle)
    return bool(c2c_info.is_c2c_enabled)

341

@property
def is_persistence_mode_enabled(self) -> bool:
    """
    ``True`` when persistence mode is enabled for this device.

    Assigning a truthy or falsy value to this property enables or disables
    persistence mode.

    For Linux only.
    """
    mode = nvml.device_get_persistence_mode(self._handle)
    return mode == nvml.EnableState.FEATURE_ENABLED

@is_persistence_mode_enabled.setter
def is_persistence_mode_enabled(self, enabled: bool) -> None:
    # Translate the Python truth value into NVML's enable/disable enum.
    if enabled:
        requested_state = nvml.EnableState.FEATURE_ENABLED
    else:
        requested_state = nvml.EnableState.FEATURE_DISABLED
    nvml.device_set_persistence_mode(self._handle, requested_state)

357

@property
def cuda_compute_capability(self) -> tuple[int, int]:
    """
    CUDA compute capability as a ``(major, minor)`` tuple.

    For example, a Tesla V100 reports `(7, 0)`.
    """
    capability = nvml.device_get_cuda_compute_capability(self._handle)
    return capability

366

def to_cuda_device(self) -> "cuda.core.Device":
    """
    Get the corresponding :class:`cuda.core.Device` (which is used for CUDA
    access) for this :class:`cuda.core.system.Device` (which is used for
    NVIDIA Management Library (NVML) access).

    The devices are mapped to one another by their UUID.

    Returns
    -------
    cuda.core.Device
        The corresponding CUDA device.

    Raises
    ------
    RuntimeError
        No corresponding CUDA device is found for this NVML device.

        For example, on a MIG system, the physical GPU will not have an
        available CUDA device, since it can not be used directly, even
        though it can be enumerated from NVML.
    """
    # Imported here rather than at module level, as in the original code.
    from cuda.core import Device as CudaDevice

    # CUDA does not have an API to get a device by its UUID, so we search all
    # the devices for one with a matching UUID. The NVML UUID is queried once
    # up front instead of once per candidate device.
    target_uuid = self.uuid_without_prefix

    for cuda_device in CudaDevice.get_all_devices():
        if cuda_device.uuid == target_uuid:
            return cuda_device

    raise RuntimeError("No corresponding CUDA device found for this NVML device.")

399

@classmethod
def get_device_count(cls) -> int:
    """
    Count the devices NVML can see.

    Returns
    -------
    int
        The number of available devices.
    """
    # NVML must be initialized before it can be queried.
    initialize()
    device_count = nvml.device_get_count_v2()
    return device_count

413

@classmethod
def get_all_devices(cls) -> Iterable[Device]:
    """
    Lazily enumerate every available device.

    Returns
    -------
    Iterator over :obj:`~Device`
        Yields one instance per NVML index, in ascending index order.
    """
    initialize()

    device_count = nvml.device_get_count_v2()
    yield from (cls(index=device_id) for device_id in range(device_count))

428

429 #########################################################################

430 # ADDRESSING MODE

431

@property
def addressing_mode(self) -> AddressingMode | None:
    """
    :obj:`~AddressingMode` of the device.

    ``None`` is returned when NVML reports a mode that has no entry in
    ``_ADDRESSING_MODE_MAPPING``.
    """
    raw_mode = nvml.device_get_addressing_mode(self._handle).value
    return _ADDRESSING_MODE_MAPPING.get(raw_mode)

438

439 #########################################################################

440 # MIG (MULTI-INSTANCE GPU) DEVICES

441

@property
def mig(self) -> MigInfo:
    """
    :obj:`~MigInfo` accessor for MIG (Multi-Instance GPU) information.

    For Ampere™ or newer fully supported devices.
    """
    mig_accessor = MigInfo(self)
    return mig_accessor

450

451 #########################################################################

452 # AFFINITY

453

@classmethod
def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]:
    """
    Retrieve the set of GPUs that have a CPU affinity with the given CPU number.

    Supported on Linux only.

    Parameters
    ----------
    cpu_index: int
        The CPU index.

    Returns
    -------
    Iterator of :obj:`~Device`
        An iterator over the devices NVML reports for ``cpu_index``.
    """
    cdef Device device

    # Like the other classmethods that query NVML without an existing Device,
    # make sure the library is initialized before the first call.
    initialize()

    for handle in nvml.system_get_topology_gpu_set(cpu_index):
        # NVML already returned a handle, so skip ``__init__`` (which would
        # look the handle up again) and assign it directly.
        device = Device.__new__(Device)
        device._handle = handle
        yield device

476

def get_memory_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    List the NUMA nodes or CPU sockets with the ideal memory affinity for
    this device.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    When the requested scope does not apply to the target topology, the API
    falls back to reporting the memory affinity of the device's immediate
    non-I/O ancestor.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        Scope of the affinity query; one of the values of
        :class:`AffinityScope`. Defaults to :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        Indices of the NUMA nodes or CPU sockets with the ideal memory
        affinity for the device.

    Raises
    ------
    ValueError
        If ``scope`` is not a key of ``_AFFINITY_SCOPE_MAPPING``.
    """
    if scope not in _AFFINITY_SCOPE_MAPPING:
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {list(AffinityScope.__members__.values())}"
        ) from None
    nvml_scope = _AFFINITY_SCOPE_MAPPING[scope]

    # Request one 64-bit mask word per 64 CPUs, rounded up.
    word_count = <unsigned int>ceil(cpu_count() / 64)
    raw_mask = nvml.device_get_memory_affinity(self._handle, word_count, nvml_scope)
    return _unpack_bitmask(raw_mask)

516

def get_cpu_affinity(self, scope: AffinityScope | str=AffinityScope.NODE) -> list[int]:
    """
    Retrieves a list of indices of the CPUs with the ideal CPU affinity for
    the device, within the given scope.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.

    If requested scope is not applicable to the target topology, the API
    will fall back to reporting the CPU affinity for the immediate non-I/O
    ancestor of the device.

    Parameters
    ----------
    scope: AffinityScope | str, optional
        The scope of the affinity query. Must be one of the values of
        :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`.

    Returns
    -------
    list[int]
        A list of indices of the CPUs with the ideal CPU affinity for the
        device.

    Raises
    ------
    ValueError
        If ``scope`` is not a key of ``_AFFINITY_SCOPE_MAPPING``.
    """
    try:
        scope = _AFFINITY_SCOPE_MAPPING[scope]
    except KeyError:
        raise ValueError(
            f"Invalid affinity scope: {scope}. "
            f"Must be one of {list(AffinityScope.__members__.values())}"
        ) from None
    return _unpack_bitmask(
        nvml.device_get_cpu_affinity_within_scope(
            self._handle,
            # One 64-bit mask word per 64 CPUs, rounded up.
            <unsigned int>ceil(cpu_count() / 64),
            scope,
        )
    )

556

def set_cpu_affinity(self):
    """
    Apply the ideal CPU affinity for this device to the calling thread.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    handle = self._handle
    nvml.device_set_cpu_affinity(handle)

566

def clear_cpu_affinity(self):
    """
    Remove every affinity binding from the calling thread.

    For Kepler™ or newer fully supported devices.

    Supported on Linux only.
    """
    handle = self._handle
    nvml.device_clear_cpu_affinity(handle)

576

577 #########################################################################

578 # CLOCK

579 # See external class definitions in _clock.pxi

580

def get_clock(self, clock_type: ClockType | str) -> ClockInfo:
    """
    Build a :obj:`~_device.ClockInfo` for inspecting and managing one
    specific clock of this device.
    """
    clock = ClockInfo(self._handle, clock_type)
    return clock

586

@property
def is_auto_boosted_clocks_enabled(self) -> tuple[bool, bool]:
    """
    Current and default state of auto boosted clocks on this device.

    For Kepler™ or newer fully supported devices.

    Some hardware enables Auto Boosted clocks by default, letting the GPU
    run at higher clock rates to maximize performance as thermal limits
    allow.

    On Pascal™ and newer hardware, Auto Boosted clocks are controlled
    through application clocks. Use :meth:`set_application_clocks` and
    :meth:`reset_application_clocks` to control Auto Boost behavior.

    Returns
    -------
    bool
        The current state of Auto Boosted clocks
    bool
        The default Auto Boosted clocks behavior
    """
    current_state, default_state = nvml.device_get_auto_boosted_clocks_enabled(self._handle)
    is_current_on = current_state == nvml.EnableState.FEATURE_ENABLED
    is_default_on = default_state == nvml.EnableState.FEATURE_ENABLED
    return is_current_on, is_default_on

612

@property
def current_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    Retrieves the current :obj:`~ClocksEventReasons`.

    For all fully supported products.

    Raises
    ------
    ValueError
        If NVML reports a bit that has no entry in
        ``_CLOCKS_EVENT_REASONS_MAPPING``.
    """
    cdef uint64_t[1] reasons
    reasons[0] = nvml.device_get_current_clocks_event_reasons(self._handle)
    output_reasons = []
    # Each value yielded by ``_unpack_bitmask`` is treated as a bit position.
    for bit_index in _unpack_bitmask(reasons):
        reason_bit = 1 << bit_index
        try:
            output_reasons.append(_CLOCKS_EVENT_REASONS_MAPPING[reason_bit])
        except KeyError:
            # ``from None`` hides the internal KeyError, matching how the
            # other lookups in this class translate their errors.
            raise ValueError(f"Unknown clock event reason bit: {reason_bit}") from None
    return output_reasons

630

@property
def supported_clock_event_reasons(self) -> list[ClocksEventReasons]:
    """
    Retrieves supported :obj:`~ClocksEventReasons` that can be returned by
    the `current_clock_event_reasons` property.

    For all fully supported products.

    This property is not supported in virtual machines running virtual GPU (vGPU).

    Raises
    ------
    ValueError
        If NVML reports a bit that has no entry in
        ``_CLOCKS_EVENT_REASONS_MAPPING``.
    """
    cdef uint64_t[1] reasons
    reasons[0] = nvml.device_get_supported_clocks_event_reasons(self._handle)
    output_reasons = []
    # Each value yielded by ``_unpack_bitmask`` is treated as a bit position.
    for bit_index in _unpack_bitmask(reasons):
        reason_bit = 1 << bit_index
        try:
            output_reasons.append(_CLOCKS_EVENT_REASONS_MAPPING[reason_bit])
        except KeyError:
            # ``from None`` hides the internal KeyError, matching how the
            # other lookups in this class translate their errors.
            raise ValueError(f"Unknown clock event reason bit: {reason_bit}") from None
    return output_reasons

651

652 ##########################################################################

653 # COOLER

654 # See external class definitions in _cooler.pxi

655

@property
def cooler(self) -> CoolerInfo:
    """
    :obj:`~_device.CoolerInfo` describing the cooler of this device.
    """
    raw_info = nvml.device_get_cooler_info(self._handle)
    return CoolerInfo(raw_info)

662

663 ##########################################################################

664 # DEVICE ATTRIBUTES

665 # See external class definitions in _device_attributes.pxi

666

@property
def attributes(self) -> DeviceAttributes:
    """
    :obj:`~_device.DeviceAttributes` holding various attributes of this
    device.

    For Ampere™ or newer fully supported devices. Only available on Linux
    systems.
    """
    raw_attributes = nvml.device_get_attributes_v2(self._handle)
    return DeviceAttributes(raw_attributes)

676

677 #########################################################################

678 # DISPLAY

679

@property
def is_display_connected(self) -> bool:
    """
    Display mode of this device.

    ``True`` when a physical display (e.g. monitor) is currently connected
    to any of the device's connectors.
    """
    display_mode = nvml.device_get_display_mode(self._handle)
    return display_mode == nvml.EnableState.FEATURE_ENABLED

689

@property
def is_display_active(self) -> bool:
    """
    Display active status of this device.

    ``True`` when a display is initialized on the device, for example when
    an X Server is attached to it and has allocated memory for the screen.

    A display can be active even when no monitor is physically attached.
    """
    active_state = nvml.device_get_display_active(self._handle)
    return active_state == nvml.EnableState.FEATURE_ENABLED

702

703 ##########################################################################

704 # EVENTS

705 # See external class definitions in _event.pxi

706

def register_events(self, events: EventType | str | list[EventType | str]) -> DeviceEvents:
    """
    Begin recording events on this device.

    For Fermi™ or newer fully supported devices. For Linux only.

    ECC events are only available on ECC-enabled devices (see
    :meth:`Device.get_total_ecc_errors`), and power capping events only on
    Power Management enabled devices (see
    :meth:`Device.get_power_management_mode`).

    Recording starts with this call; events that occurred earlier are not
    recorded. Use :meth:`DeviceEvents.wait` on the returned object to wait
    for events.

    Examples
    --------
    >>> device = Device(index=0)
    >>> events = device.register_events([
    ...     EventType.XID_CRITICAL_ERROR,
    ... ])
    >>> while event := events.wait(timeout_ms=10000):
    ...     print(f"Event {event.event_type} occurred on device {event.device.uuid}")

    Parameters
    ----------
    events: EventType, str, or list of EventType or str
        The event type, or list of event types, to register for this device.

    Returns
    -------
    :obj:`~_device.DeviceEvents`
        Object representing the registered events. Call
        :meth:`~_device.DeviceEvents.wait` on it to wait for events.

    Raises
    ------
    :class:`cuda.core.system.NotSupportedError`
        None of the requested event types are registered.
    """
    registration = DeviceEvents(self._handle, events)
    return registration

748

def get_supported_event_types(self) -> list[EventType]:
    """
    Get the list of event types supported by this device.

    For Fermi™ or newer fully supported devices. For Linux only (returns an
    empty list on Windows).

    Returns
    -------
    list[EventType]
        The list of supported event types.

    Raises
    ------
    ValueError
        If NVML reports a bit that has no entry in ``_EVENT_TYPE_MAPPING``.
    """
    cdef uint64_t[1] bitmask
    bitmask[0] = nvml.device_get_supported_event_types(self._handle)
    events = []
    # Each value yielded by ``_unpack_bitmask`` is treated as a bit position.
    for bit_index in _unpack_bitmask(bitmask):
        event_bit = 1 << bit_index
        try:
            events.append(_EVENT_TYPE_MAPPING[event_bit])
        except KeyError:
            # ``from None`` hides the internal KeyError, matching how the
            # other lookups in this class translate their errors.
            raise ValueError(f"Unknown event type bit: {event_bit}") from None
    return events

771

772 ##########################################################################

773 # FAN

774 # See external class definitions in _fan.pxi

775

def get_fan(self, fan: int = 0) -> FanInfo:
    """
    :obj:`~_device.FanInfo` object to get information and manage a specific fan on a device.

    Parameters
    ----------
    fan: int, optional
        Index of the fan. Defaults to ``0``.

    Raises
    ------
    ValueError
        If ``fan`` is negative or not smaller than ``num_fans``.
    """
    # Query the fan count once, so the bound that is checked is the same
    # bound that appears in the error message and NVML is not asked twice.
    fan_count = self.num_fans
    if fan < 0 or fan >= fan_count:
        raise ValueError(f"Fan index {fan} is out of range [0, {fan_count})")
    return FanInfo(self._handle, fan)

783

@property
def num_fans(self) -> int:
    """
    Number of fans on this device.
    """
    fan_count = nvml.device_get_num_fans(self._handle)
    return fan_count

790

791 ##########################################################################

792 # FIELD VALUES

793 # See external class definitions in _field_values.pxi

794

def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues:
    """
    Query several field values from the device in one call.

    Every requested value can carry its own exception, which is raised when
    the corresponding ``value`` is read from the returned
    :obj:`~_device.FieldValues` container.

    Call :meth:`~_device.FieldValues.validate` to confirm that the whole
    container is free of exceptions.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        Field IDs to query.

        Each item is either a single :class:`FieldId` value or a pair of
        (:class:`FieldId`, scope ID).

    Returns
    -------
    :obj:`~_device.FieldValues`
        Container of field values matching the requested field IDs.
    """
    if len(field_ids) > 0:
        raw_values = nvml.device_get_field_values(self._handle, field_ids)
        return FieldValues(raw_values)

    # NVML raises an InvalidArgumentError for a zero-length request, so an
    # empty container is built without calling it.
    return FieldValues(nvml.FieldValue(0))

825

def clear_field_values(self, field_ids: list[int | tuple[int, int]]) -> None:
    """
    Clear several field values on the device in one call.

    Parameters
    ----------
    field_ids: list[int | tuple[int, int]]
        Field IDs to clear.

        Each item is either a single :class:`FieldId` value or a pair of
        (:class:`FieldId`, scope ID).
    """
    # NVML raises an InvalidArgumentError for a zero-length request, so it is
    # only called when there is something to clear.
    if len(field_ids) != 0:
        nvml.device_clear_field_values(self._handle, field_ids)

844

845 ##########################################################################

846 # INFOROM

847 # See external class definitions in _inforom.pxi

848

@property
def inforom(self) -> InforomInfo:
    """
    :obj:`~_device.InforomInfo` accessor for InfoROM information.

    For all products with an InfoROM.
    """
    inforom_accessor = InforomInfo(self)
    return inforom_accessor

857

858 ##########################################################################

859 # MEMORY

860 # See external class definitions in _memory.pxi

861

@property
def bar1_memory_info(self) -> BAR1MemoryInfo:
    """
    :obj:`~_device.BAR1MemoryInfo` describing the BAR1 memory of this device.

    BAR1 maps the FB (device memory) so that the CPU or 3rd party devices
    (peer-to-peer on the PCIE bus) can access it directly.
    """
    raw_info = nvml.device_get_bar1_memory_info(self._handle)
    return BAR1MemoryInfo(raw_info)

872

@property
def memory_info(self) -> MemoryInfo:
    """
    :obj:`~_device.MemoryInfo` describing the memory of this device.
    """
    raw_info = nvml.device_get_memory_info_v2(self._handle)
    return MemoryInfo(raw_info)

879

880 ##########################################################################

881 # NVLINK

882 # See external class definitions in _nvlink.pxi

883

def get_nvlink(self, link: int) -> NvlinkInfo:
    """
    Build the :obj:`~NvlinkInfo` for one NVLink of this device.

    For devices with NVLink support.

    Raises
    ------
    ValueError
        If ``link`` is negative or not smaller than ``NvlinkInfo.max_links``.
    """
    if 0 <= link < NvlinkInfo.max_links:
        return NvlinkInfo(self, link)
    raise ValueError(f"Link index {link} is out of range [0, {NvlinkInfo.max_links})")

893

894 ##########################################################################

895 # PCI INFO

896 # See external class definitions in _pci_info.pxi

897

@property
def pci_info(self) -> PciInfo:
    """
    :obj:`~_device.PciInfo` holding the PCI attributes of this device.

    Non-physical devices, such as MIG devices, may have no PCI attributes;
    this property raises a `RuntimeError` in that case.
    """
    try:
        raw_info = nvml.device_get_pci_info_ext(self._handle)
    except nvml.InvalidArgumentError:
        # NVML's InvalidArgumentError is reported as "no PCI attributes".
        raise RuntimeError("This device does not have PCI attributes") from None
    return PciInfo(raw_info, self._handle)

912

913 ##########################################################################

914 # PERFORMANCE

915 # See external class definitions in _performance.pxi

916

917 @property

918 def performance_state(self) -> int | None:

919 """

920 The current performance state of the device.

921

922 For Fermi™ or newer fully supported devices.

923

924 Returns

925 -------

926 int | None

927 The current performance state of the device, as an integer between 0 and 15,

928 where 0 is maximum performance and higher numbers are lower performance.

929 Returns `None` if the performance state is unknown.

930 """

931 return _pstate_to_int(nvml.device_get_performance_state(self._handle)) 2 ctx1ec

932

933 @property

934 def dynamic_pstates_info(self) -> GpuDynamicPstatesInfo:

935 """

936 :obj:`~_device.GpuDynamicPstatesInfo` object with performance monitor samples from the associated subdevice.

937 """

938 return GpuDynamicPstatesInfo(nvml.device_get_dynamic_pstates_info(self._handle)) 1 ctx1c

939

940 @property

941 def supported_pstates(self) -> list[int]:

942 """

943 Get all supported Performance States (P-States) for the device.

944

945 The returned list contains a contiguous list of valid P-States supported by

946 the device.

947

948 Returns

949 -------

950 list[int]

951 A list of the supported performance states of the device, each an integer

952 between 0 and 15, where 0 is maximum performance and higher numbers

953 are lower performance.

954 """

955 # From nvml.h:

956 # The returned array would contain a contiguous list of valid P-States

957 # supported by the device. If the number of supported P-States is fewer

958 # than the size of the array supplied, missing elements would contain

959 # NVML_PSTATE_UNKNOWN.

960

961 pstates = [] 1 ctx1c

962 for pstate in nvml.device_get_supported_performance_states(self._handle): 1 ctx1c

963 pstate_value = _pstate_to_int(pstate) 1 ctx1c

964 if pstate_value is not None: 1 ctx1c

965 pstates.append(pstate_value) 1 ctx1c

966 return pstates 1 ctx1c

967

968 ##########################################################################

969 # PROCESS

970 # See external class definitions in _process.pxi

971

972 @property

973 def compute_running_processes(self) -> list[ProcessInfo]:

974 """

975 Get information about processes with a compute context on this device.

976

977 For Fermi™ or newer fully supported devices.

978

979 This function returns information only about compute running processes

980 (e.g. CUDA applications which have an active context). Any graphics

981 applications (e.g. using OpenGL, DirectX) won't be listed by this

982 function.

983

984 Keep in mind that information returned by this call is dynamic and the

985 number of elements might change in time.

986

987 In MIG mode, if a device handle is provided, the API returns aggregate

988 information, only if the caller has appropriate privileges. Per-instance

989 information can be queried by using specific MIG device handles.

990 Querying per-instance information using MIG device handles is not

991 supported if the device is in vGPU Host virtualization mode.

992 """

993 return [ProcessInfo(self, proc) for proc in nvml.device_get_compute_running_processes_v3(self._handle)] 2 ctx1qQ

994

995 ##########################################################################

996 # REPAIR STATUS

997 # See external class definitions in _repair_status.pxi

998

999 @property

1000 def repair_status(self) -> RepairStatus:

1001 """

1002 :obj:`~_device.RepairStatus` object with TPC/Channel repair status.

1003

1004 For Ampere™ or newer fully supported devices.

1005 """

1006 return RepairStatus(self._handle) 1 ctx1M

1007

1008 ##########################################################################

1009 # TEMPERATURE

1010 # See external class definitions in _temperature.pxi

1011

1012 @property

1013 def temperature(self) -> Temperature:

1014 """

1015 :obj:`~_device.Temperature` object with temperature information for the device.

1016 """

1017 return Temperature(self._handle) 1 ctx1N

1018

1019 #######################################################################

1020 # TOPOLOGY

1021

1022 def get_topology_nearest_gpus(self, level: GpuTopologyLevel | str) -> Iterable[Device]: (empty)

1023 """

1024 Retrieve the GPUs that are nearest to this device at a specific interconnectivity level.

1025

1026 Supported on Linux only.

1027

1028 Parameters

1029 ----------

1030 level: :class:`GpuTopologyLevel` | str

1031 The topology level.

1032

1033 Returns

1034 -------

1035 Iterable of :class:`Device`

1036 The nearest devices at the given topology level.

1037 """

1038 cdef Device device

1039 try: 1 ctx1o

1040 level = _GPU_TOPOLOGY_LEVEL_MAPPING[level] 1 ctx1o

1041 except KeyError:

1042 raise ValueError(

1043 f"Invalid topology level: {level}. "

1044 f"Must be one of {list(GpuTopologyLevel.__members__.values())}"

1045 ) from None

1046 for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): 1 ctx1o

1047 device = Device.__new__(Device)

1048 device._handle = handle

1049 yield device

1050

1051 #######################################################################

1052 # UTILIZATION

1053

1054 @property

1055 def utilization(self) -> Utilization:

1056 """

1057 Retrieves the current :obj:`~Utilization` rates for the device's major

1058 subsystems.

1059

1060 For Fermi™ or newer fully supported devices.

1061

1062 Note: During driver initialization when ECC is enabled one can see high

1063 GPU and Memory Utilization readings. This is caused by ECC Memory

1064 Scrubbing mechanism that is performed during driver initialization.

1065

1066 Note: On MIG-enabled GPUs, querying device utilization rates is not

1067 currently supported.

1068

1069 Returns

1070 -------

1071 Utilization

1072 An object containing the current utilization rates for the device.

1073 """

1074 return Utilization(nvml.device_get_utilization_rates(self._handle)) 1 ctx1O

1075

1076

1077def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel: (empty)

1078 """

1079 Retrieve the common ancestor for two devices.

1080

1081 For Linux only.

1082

1083 Parameters

1084 ----------

1085 device1: :class:`Device`

1086 The first device.

1087 device2: :class:`Device`

1088 The second device.

1089

1090 Returns

1091 -------

1092 :class:`GpuTopologyLevel`

1093 The common ancestor level of the two devices.

1094 """

1095 return _GPU_TOPOLOGY_LEVEL_INV_MAPPING[

1096 nvml.device_get_topology_common_ancestor(

1097 device1._handle,

1098 device2._handle,

1099 )

1100 ]

1101

1102

1103def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex | str) -> GpuP2PStatus: (empty)

1104 """

1105 Retrieve the P2P status between two devices.

1106

1107 Parameters

1108 ----------

1109 device1: :class:`Device`

1110 The first device.

1111 device2: :class:`Device`

1112 The second device.

1113 index: :class:`GpuP2PCapsIndex` | str

1114 The P2P capability index being looked for between ``device1`` and ``device2``.

1115

1116 Returns

1117 -------

1118 :class:`GpuP2PStatus`

1119 The P2P status between the two devices.

1120 """

1121 try:

1122 index_enum = _GPU_P2P_CAPS_INDEX_MAPPING[index]

1123 except KeyError:

1124 raise ValueError(

1125 f"Invalid P2P caps index: {index}. "

1126 f"Must be one of {list(GpuP2PCapsIndex.__members__.values())}"

1127 ) from None

1128 return _GPU_P2P_STATUS_MAPPING.get(

1129 nvml.device_get_p2p_status(

1130 device1._handle,

1131 device2._handle,

1132 index_enum,

1133 ),

1134 GpuP2PStatus.UNKNOWN

1135 )

1136

1137

1138__all__ = [ (empty)

1139 "Device",

1140 "get_p2p_status",

1141 "get_topology_common_ancestor",

1142 "NvlinkInfo",

1143]

Coverage for cuda / core / system / _device.pyx: 75.29%

255 statements