Coverage for cuda/core/_memory/_managed_memory_ops.pyx: 92.22%

180 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-13 01:38 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from __future__ import annotations 

6  

7from collections.abc import Sequence 

8from typing import TYPE_CHECKING 

9  

10IF CUDA_CORE_BUILD_MAJOR >= 13: 

11 from libcpp.vector cimport vector 

12  

13from cuda.bindings cimport cydriver 

14from cuda.core._memory._buffer cimport Buffer 

15from cuda.core._resource_handles cimport as_cu 

16from cuda.core._stream cimport Stream, Stream_accept 

17from cuda.core._utils.cuda_utils cimport HANDLE_RETURN 

18  

19from cuda.core._host import Host 

20from cuda.core._utils.cuda_utils import driver 

21from cuda.core._memory._managed_location import _coerce_location 

22  

23if TYPE_CHECKING: 

24 from cuda.core._graph import GraphBuilder 

25 from cuda.core._device import Device 

26  

# Location ``kind`` strings, grouped from the most restrictive set to the
# most permissive one. They are compared against ``loc.kind`` when advice is
# validated.
cdef frozenset _DEVICE_HOST_ONLY = frozenset({"device", "host"})
cdef frozenset _DEVICE_HOST_NUMA = frozenset({"device", "host", "host_numa"})
cdef frozenset _ALL_LOCATION_TYPES = frozenset(
    {"device", "host", "host_numa", "host_numa_current"}
)

# Advice values for which the caller may omit the location: membership here
# becomes the ``allow_none`` flag handed to ``_coerce_location``.
cdef set _ADVICE_IGNORES_LOCATION = {
    driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_READ_MOSTLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
}

# Advice value -> location kinds accepted for it. An advice value that is
# missing from this table is rejected as unsupported.
cdef dict _ADVICE_ALLOWED_LOCTYPES = {
    # read-mostly hints
    driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY: _DEVICE_HOST_NUMA,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_READ_MOSTLY: _DEVICE_HOST_NUMA,
    # preferred-location hints
    driver.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION: _ALL_LOCATION_TYPES,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: _DEVICE_HOST_NUMA,
    # accessed-by hints
    driver.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY: _DEVICE_HOST_ONLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_ACCESSED_BY: _DEVICE_HOST_ONLY,
}

45  

46  

cdef void _require_managed_buffer(Buffer self, str what):
    """Raise ``ValueError`` unless ``self.is_managed`` is truthy.

    ``what`` is the name of the calling operation; it prefixes the error
    message so the caller can be identified.
    """
    # Buffer.is_managed handles both pointer-attribute and memory-resource
    # paths (e.g. pool-allocated managed memory whose pointer attribute
    # does not advertise CU_POINTER_ATTRIBUTE_IS_MANAGED).
    if self.is_managed:
        return
    raise ValueError(f"{what} requires a managed-memory allocation")

53  

54  

cdef tuple _coerce_batch_buffers(object buffers, str what):
    """Validate ``buffers`` and return it as a ``tuple`` of ``Buffer``.

    A bare ``Buffer`` is rejected with ``TypeError``: single-buffer
    operations go through the corresponding ManagedBuffer instance method.
    Anything that is not a ``Sequence`` also raises ``TypeError``; an empty
    sequence raises ``ValueError``. Every element goes through a checked
    cast to ``Buffer``. ``what`` names the calling operation in messages.
    """
    if isinstance(buffers, Buffer):
        raise TypeError(
            f"{what}: pass a sequence of Buffers; for a single buffer use "
            f"the ManagedBuffer instance method"
        )
    if not isinstance(buffers, Sequence):
        raise TypeError(
            f"{what}: buffers must be a sequence of Buffer, "
            f"got {type(buffers).__name__}"
        )
    if not buffers:
        raise ValueError(f"{what}: empty buffers sequence")
    # ``<Buffer?>`` is Cython's type-checked cast; it raises TypeError for
    # an element that is not a Buffer.
    return tuple([<Buffer?>item for item in buffers])

80  

81  

cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what):
    """Return a tuple of ``n`` coerced locations.

    A non-sequence ``location`` is coerced once and repeated ``n`` times.
    A sequence must contain exactly ``n`` entries (``ValueError``
    otherwise); each entry is coerced individually. ``allow_none`` is
    forwarded to ``_coerce_location`` and ``what`` names the calling
    operation in the error message.
    """
    cdef object single
    if not isinstance(location, Sequence):
        single = _coerce_location(location, allow_none=allow_none)
        return (single,) * n
    if len(location) != n:
        raise ValueError(
            f"{what}: location length {len(location)} does not match "
            f"targets length {n}"
        )
    return tuple([_coerce_location(entry, allow_none=allow_none) for entry in location])

93  

94  

IF CUDA_CORE_BUILD_MAJOR >= 13:
    # Translate a coerced location object (read through its ``kind`` and
    # ``id`` attributes) into a cydriver.CUmemLocation struct.
    cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc):
        cdef cydriver.CUmemLocation result
        cdef str kind = loc.kind
        # Only "device" and "host_numa" carry an id; every other kind uses 0.
        result.id = 0
        if kind == "device":
            result.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
            result.id = <int>loc.id
        elif kind == "host_numa":
            result.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
            result.id = <int>loc.id
        elif kind == "host":
            result.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
        else:
            # Any remaining kind is treated as host_numa_current.
            result.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT
        return result
ELSE:
    # CUDA 12 cuMemPrefetchAsync takes a device ordinal (-1 = host).
    # -2 is the declared error sentinel, so -1 stays usable as a real value.
    cdef inline int _to_legacy_device(object loc) except? -2:
        cdef str kind = loc.kind
        if kind == "host":
            return -1
        if kind != "device":
            # NUMA-aware host kinds cannot be expressed as a device ordinal.
            raise RuntimeError(
                "Host(numa_id=...) / Host.numa_current() require both cuda-bindings 13.0+ "
                "and a CUDA 13+ runtime driver; use Host() instead"
            )
        return <int>loc.id

125  

126  

def discard_batch(stream: Stream | GraphBuilder, buffers: Sequence[Buffer]) -> None:
    """Discard a batch of managed-memory ranges.

    Requires CUDA 13+. For a single buffer, use
    :meth:`ManagedBuffer.discard` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous discard. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to discard (a bare
        :class:`Buffer` is rejected). Resident pages are released without
        prefetching new contents; subsequent access is satisfied by lazy
        migration.

    Raises
    ------
    TypeError
        If ``buffers`` is a single :class:`Buffer`, is not a sequence, or
        contains a non-:class:`Buffer` element.
    ValueError
        If ``buffers`` is empty or any buffer is not managed memory.
    NotImplementedError
        On a CUDA 12 build of ``cuda.core``.
    """
    cdef Buffer candidate
    cdef tuple targets = _coerce_batch_buffers(buffers, "discard_batch")
    cdef Stream queue = Stream_accept(stream)

    # Check every buffer before anything is submitted to the driver.
    for candidate in targets:
        _require_managed_buffer(candidate, "discard_batch")

    _do_batch_discard(targets, queue)

156  

157  

def _do_single_discard_py(Buffer buf, stream: Stream | GraphBuilder | None) -> None:
    """Internal: discard one managed buffer on behalf of ManagedBuffer.discard().

    Raises ``ValueError`` when ``buf`` is not managed memory.
    """
    _require_managed_buffer(buf, "discard")
    cdef Stream queue = Stream_accept(stream)
    # No single-range cuMemDiscard exists, so the buffer is submitted as a
    # one-element batch.
    _do_batch_discard((buf,), queue)

166  

167  

cdef void _do_batch_discard(tuple bufs, Stream s):
    """Submit ``cuMemDiscardBatchAsync`` for every buffer in ``bufs`` on ``s``.

    On a CUDA 12 build this raises ``NotImplementedError`` instead.
    Callers are expected to have validated ``bufs`` already.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef Py_ssize_t count = len(bufs)
        cdef Py_ssize_t idx
        cdef Buffer current
        cdef cydriver.CUstream hstream = as_cu(s._h_stream)
        # Parallel arrays: entry ``idx`` holds the pointer / byte size of
        # ``bufs[idx]``.
        cdef vector[cydriver.CUdeviceptr] dptrs
        cdef vector[size_t] lengths
        dptrs.resize(count)
        lengths.resize(count)
        for idx in range(count):
            current = <Buffer>bufs[idx]
            dptrs[idx] = as_cu(current._h_ptr)
            lengths[idx] = current._size
        # The driver call needs no Python state, so it runs without the GIL.
        with nogil:
            HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync(
                dptrs.data(), lengths.data(), <size_t>count, 0, hstream,
            ))
    ELSE:
        raise NotImplementedError(
            "discard requires a CUDA 13 build of cuda.core"
        )

190  

191  

def _advise_one(Buffer buf, advice: driver.CUmem_advise, location: Device | Host | None) -> None:
    """Internal: apply managed-memory advice to a single buffer.

    Used by :class:`ManagedBuffer` property setters. Not part of the
    public API.

    Raises
    ------
    ValueError
        If ``buf`` is not managed memory, ``advice`` has no entry in the
        allowed-location table, or the coerced location's kind is not
        permitted for ``advice``.
    TypeError
        If ``advice`` is not a ``CUmem_advise`` value.
    """
    _require_managed_buffer(buf, "advise")
    if not isinstance(advice, driver.CUmem_advise):
        raise TypeError(
            f"advice must be a cuda.bindings.driver.CUmem_advise value, "
            f"got {type(advice).__name__}"
        )
    cdef frozenset permitted = _ADVICE_ALLOWED_LOCTYPES.get(advice)
    if permitted is None:
        raise ValueError(f"Unsupported advice value: {advice!r}")
    # For advice values that ignore the location, None is an acceptable input.
    cdef bint location_optional = advice in _ADVICE_IGNORES_LOCATION
    cdef object target = _coerce_location(location, allow_none=location_optional)
    if not (target is None or target.kind in permitted):
        raise ValueError(
            f"advise {advice.name} does not support location_type='{target.kind}'"
        )
    _do_single_advise(buf, advice, target, location_optional)

214  

215  

cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none):
    """Call ``cuMemAdvise`` over the whole of ``buf``.

    ``advice_value`` is converted to the C enum through ``int()``. ``loc``
    is a coerced location or ``None``. ``allow_none`` is accepted but not
    read by this function.
    """
    cdef cydriver.CUdeviceptr dptr = as_cu(buf._h_ptr)
    cdef size_t length = buf._size
    cdef cydriver.CUmem_advise c_advice = <cydriver.CUmem_advise>(<int>int(advice_value))
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef cydriver.CUmemLocation target
        if loc is not None:
            target = _to_cumemlocation(loc)
        else:
            # Driver ignores location for read_mostly / unset_preferred_location
            # advice values but still validates the CUmemLocation; pass a
            # host placeholder.
            target.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
            target.id = 0
        with nogil:
            HANDLE_RETURN(cydriver.cuMemAdvise(dptr, length, c_advice, target))
    ELSE:
        # The legacy entry point takes a device ordinal; -1 is used when no
        # location was supplied.
        cdef int ordinal = -1
        if loc is not None:
            ordinal = _to_legacy_device(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemAdvise(dptr, length, c_advice, ordinal))

236  

237  

def prefetch_batch(
    stream: Stream | GraphBuilder,
    buffers: Sequence[Buffer],
    locations: Device | Host | Sequence[Device | Host],
) -> None:
    """Prefetch a batch of managed-memory ranges to target locations.

    Works on both CUDA 12 and CUDA 13 builds of ``cuda.core`` (see Notes).
    For a single buffer, use :meth:`ManagedBuffer.prefetch` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous prefetch. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to operate on. A bare
        :class:`Buffer` is rejected.
    locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...]
        Target location(s). A single location applies to all buffers; a
        sequence must match ``len(buffers)``.

    Raises
    ------
    TypeError
        If ``buffers`` is a single :class:`Buffer`, is not a sequence, or
        contains a non-:class:`Buffer` element.
    ValueError
        If ``buffers`` is empty, a ``locations`` sequence has a different
        length than ``buffers``, or any buffer is not managed memory.
    RuntimeError
        On a CUDA 12 build, if a target is a NUMA-specific host location
        (only plain device / host targets are expressible there).

    Notes
    -----
    On a CUDA 12 build, falls back to a Python-level loop calling
    ``cuMemPrefetchAsync`` per buffer (no batched driver entry point on
    CUDA 12). CUDA 13 builds use ``cuMemPrefetchBatchAsync`` directly.
    """
    cdef tuple bufs = _coerce_batch_buffers(buffers, "prefetch_batch")
    cdef Py_ssize_t n = len(bufs)
    cdef tuple locs = _broadcast_locations(locations, n, False, "prefetch_batch")
    cdef Stream s = Stream_accept(stream)

    # Validate every buffer up front so nothing is submitted when one of
    # them is not managed memory.
    cdef Buffer buf
    for buf in bufs:
        _require_managed_buffer(buf, "prefetch_batch")

    _do_batch_prefetch(bufs, locs, s)

275  

276  

def _do_single_prefetch_py(Buffer buf, location: Device | Host | None, stream: Stream | GraphBuilder | None) -> None:
    """Internal: prefetch one managed buffer for ManagedBuffer.prefetch().

    Goes through cuMemPrefetchAsync, which both the CUDA 12 and CUDA 13
    build branches provide. Raises ``ValueError`` when ``buf`` is not
    managed memory; ``location`` is coerced with ``allow_none=False``.
    """
    _require_managed_buffer(buf, "prefetch")
    cdef object target = _coerce_location(location, allow_none=False)
    cdef Stream queue = Stream_accept(stream)
    _do_single_prefetch(buf, target, queue)

286  

287  

cdef void _do_single_prefetch(Buffer buf, object loc, Stream s):
    """Issue ``cuMemPrefetchAsync`` for the whole of ``buf`` on stream ``s``.

    ``loc`` is an already-coerced location object. The CUDA 13 build
    passes a ``CUmemLocation`` struct; the CUDA 12 build passes a device
    ordinal.
    """
    cdef cydriver.CUdeviceptr dptr = as_cu(buf._h_ptr)
    cdef size_t length = buf._size
    cdef cydriver.CUstream hstream = as_cu(s._h_stream)
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef cydriver.CUmemLocation target = _to_cumemlocation(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(dptr, length, target, 0, hstream))
    ELSE:
        cdef int ordinal = _to_legacy_device(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(dptr, length, ordinal, hstream))

300  

301  

IF CUDA_CORE_BUILD_MAJOR >= 13:
    # C function-pointer type shared by cuMemPrefetchBatchAsync and
    # cuMemDiscardAndPrefetchBatchAsync: both have identical signatures,
    # so one helper can dispatch to either.
    ctypedef cydriver.CUresult (*_BatchPrefetchFn)(
        cydriver.CUdeviceptr*, size_t*, size_t,
        cydriver.CUmemLocation*, size_t*, size_t,
        unsigned long long, cydriver.CUstream,
    ) except ?cydriver.CUDA_ERROR_NOT_FOUND nogil


    def _read_preferred_location_v2(Buffer buf) -> Device | Host | None:
        """Internal: read preferred_location with full NUMA detail.

        Calls cydriver directly rather than
        cuda.bindings.driver.cuMemRangeGetAttribute, whose attribute
        allowlist doesn't yet include the cu13 _TYPE / _ID attributes.

        Returns a ``Device``, a ``Host`` (plain, NUMA-specific or
        NUMA-current), or ``None`` when the reported location type matches
        none of those.
        """
        cdef cydriver.CUdeviceptr dptr = as_cu(buf._h_ptr)
        cdef size_t length = buf._size
        cdef int kind_code = 0
        cdef int ident = 0
        # Two range-attribute queries: first the location type, then its id.
        with nogil:
            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
                <void*>&kind_code, sizeof(int),
                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE,
                dptr, length,
            ))
            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
                <void*>&ident, sizeof(int),
                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID,
                dptr, length,
            ))
        if kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE:
            # Device is imported here at call time; the module-level import
            # is TYPE_CHECKING-only.
            from cuda.core._device import Device
            return Device(ident)
        elif kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST:
            return Host()
        elif kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA:
            return Host(numa_id=ident)
        elif kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT:
            return Host.numa_current()
        # CU_MEM_LOCATION_TYPE_INVALID — no preferred location
        return None


    cdef void _do_batch_prefetch_op(tuple bufs, tuple locs, Stream s, _BatchPrefetchFn fn):
        """Shared body for batched prefetch / discard-and-prefetch.

        Packs ``bufs`` and ``locs`` (same length, paired by position) into
        C arrays and calls ``fn`` with them on stream ``s``.
        """
        cdef Py_ssize_t count = len(bufs)
        cdef Py_ssize_t idx
        cdef Buffer current
        cdef cydriver.CUstream hstream = as_cu(s._h_stream)
        cdef vector[cydriver.CUdeviceptr] dptrs
        cdef vector[size_t] lengths
        cdef vector[cydriver.CUmemLocation] targets
        cdef vector[size_t] target_indices
        dptrs.resize(count)
        lengths.resize(count)
        targets.resize(count)
        target_indices.resize(count)
        for idx in range(count):
            current = <Buffer>bufs[idx]
            dptrs[idx] = as_cu(current._h_ptr)
            lengths[idx] = current._size
            targets[idx] = _to_cumemlocation(locs[idx])
            # One location entry per buffer: index ``idx`` maps to itself.
            target_indices[idx] = <size_t>idx
        with nogil:
            HANDLE_RETURN(fn(
                dptrs.data(), lengths.data(), <size_t>count,
                targets.data(), target_indices.data(), <size_t>count,
                0, hstream,
            ))
ELSE:
    def _read_preferred_location_v2(Buffer buf) -> Device | Host | None:
        # Symbol exists so _managed_buffer.py can `from ... import
        # _read_preferred_location_v2` unconditionally at module top.
        # `ManagedBuffer.preferred_location` gates on both
        # binding_version() and driver_version() >= (13, 0, 0) before
        # calling, so this path is unreachable on a cu12 build.
        raise NotImplementedError(
            "_read_preferred_location_v2 requires a CUDA 13 build of cuda.core"
        )

384  

385  

cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s):
    """Prefetch each ``bufs[i]`` to ``locs[i]`` on stream ``s``.

    CUDA 13 builds make one batched driver call; CUDA 12 builds issue one
    single-range prefetch per buffer.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        _do_batch_prefetch_op(bufs, locs, s, cydriver.cuMemPrefetchBatchAsync)
    ELSE:
        # cu12 has no cuMemPrefetchBatchAsync; loop per-range.
        cdef Py_ssize_t count = len(bufs)
        cdef Py_ssize_t idx
        for idx in range(count):
            _do_single_prefetch(<Buffer>bufs[idx], locs[idx], s)

397  

398  

def discard_prefetch_batch(
    stream: Stream | GraphBuilder,
    buffers: Sequence[Buffer],
    locations: Device | Host | Sequence[Device | Host],
) -> None:
    """Discard a batch of managed-memory ranges and prefetch them to target locations.

    Requires CUDA 13+. For a single buffer, use
    :meth:`ManagedBuffer.discard_prefetch` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous operation. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to discard and
        re-prefetch (a bare :class:`Buffer` is rejected).
    locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...]
        Target location(s). A single location applies to all buffers;
        a sequence must match ``len(buffers)``.

    Raises
    ------
    TypeError
        If ``buffers`` is a single :class:`Buffer`, is not a sequence, or
        contains a non-:class:`Buffer` element.
    ValueError
        If ``buffers`` is empty, a ``locations`` sequence has a different
        length than ``buffers``, or any buffer is not managed memory.
    NotImplementedError
        On a CUDA 12 build of ``cuda.core``.
    """
    cdef Buffer candidate
    cdef tuple targets = _coerce_batch_buffers(buffers, "discard_prefetch_batch")
    cdef Py_ssize_t count = len(targets)
    cdef tuple destinations = _broadcast_locations(
        locations, count, False, "discard_prefetch_batch"
    )
    cdef Stream queue = Stream_accept(stream)

    # Check every buffer before anything is submitted to the driver.
    for candidate in targets:
        _require_managed_buffer(candidate, "discard_prefetch_batch")

    _do_batch_discard_prefetch(targets, destinations, queue)

435  

436  

def _do_single_discard_prefetch_py(Buffer buf, location: Device | Host | None, stream: Stream | GraphBuilder | None) -> None:
    """Internal: discard+prefetch one managed buffer for
    ManagedBuffer.discard_prefetch().

    The buffer and its coerced location are submitted as a one-element
    batch. Raises ``ValueError`` when ``buf`` is not managed memory.
    """
    _require_managed_buffer(buf, "discard_prefetch")
    cdef object target = _coerce_location(location, allow_none=False)
    cdef Stream queue = Stream_accept(stream)
    _do_batch_discard_prefetch((buf,), (target,), queue)

446  

447  

cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s):
    """Discard each ``bufs[i]`` and prefetch it to ``locs[i]`` on stream ``s``.

    Available on CUDA 13 builds only; a CUDA 12 build raises
    ``NotImplementedError``.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        # Same packing helper as batched prefetch, with the
        # discard-and-prefetch driver entry point plugged in.
        _do_batch_prefetch_op(
            bufs, locs, s, cydriver.cuMemDiscardAndPrefetchBatchAsync,
        )
    ELSE:
        raise NotImplementedError(
            "discard_prefetch requires a CUDA 13 build of cuda.core"
        )