Coverage for cuda / core / _memory / _managed_memory_ops.pyx: 93.22%

177 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-22 01:37 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from __future__ import annotations 

6  

7from collections.abc import Sequence 

8  

9IF CUDA_CORE_BUILD_MAJOR >= 13: 

10 from libcpp.vector cimport vector 

11  

12from cuda.bindings cimport cydriver 

13from cuda.core._memory._buffer cimport Buffer 

14from cuda.core._resource_handles cimport as_cu 

15from cuda.core._stream cimport Stream, Stream_accept 

16from cuda.core._utils.cuda_utils cimport HANDLE_RETURN 

17  

18from cuda.core._host import Host 

19from cuda.core._utils.cuda_utils import driver 

20from cuda.core._memory._managed_location import _coerce_location 

21  

22  

# Groupings of location ``kind`` strings, shared by the per-advice table below.
cdef frozenset _ALL_LOCATION_TYPES = frozenset(
    {"device", "host", "host_numa", "host_numa_current"}
)
cdef frozenset _DEVICE_HOST_NUMA = frozenset({"device", "host", "host_numa"})
cdef frozenset _DEVICE_HOST_ONLY = frozenset({"device", "host"})

# Advice values for which ``_advise_one`` accepts ``location=None``.
cdef set _ADVICE_IGNORES_LOCATION = {
    driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_READ_MOSTLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
}

# Advice value -> location kinds ``_advise_one`` accepts for it. An advice
# value that is missing from this table is rejected as unsupported.
cdef dict _ADVICE_ALLOWED_LOCTYPES = {
    driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY: _DEVICE_HOST_NUMA,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_READ_MOSTLY: _DEVICE_HOST_NUMA,
    driver.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION: _ALL_LOCATION_TYPES,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: _DEVICE_HOST_NUMA,
    driver.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY: _DEVICE_HOST_ONLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_ACCESSED_BY: _DEVICE_HOST_ONLY,
}

41  

42  

cdef void _require_managed_buffer(Buffer self, str what):
    """Raise ``ValueError`` unless ``self`` reports itself as managed memory.

    ``what`` names the calling operation and becomes the prefix of the
    error message.
    """
    # Buffer.is_managed handles both the pointer-attribute and the
    # memory-resource paths (e.g. pool-allocated managed memory whose
    # pointer attribute does not advertise CU_POINTER_ATTRIBUTE_IS_MANAGED).
    if self.is_managed:
        return
    raise ValueError(f"{what} requires a managed-memory allocation")

49  

50  

cdef tuple _coerce_batch_buffers(object buffers, str what):
    """Coerce ``buffers`` to a ``tuple[Buffer, ...]``; rejects a single Buffer.

    For single-buffer operations, use the corresponding ManagedBuffer
    instance method instead.

    Parameters
    ----------
    buffers : object
        Candidate batch. Must be a non-empty ``collections.abc.Sequence``
        whose elements are all :class:`Buffer` instances.
    what : str
        Name of the calling operation; prefixes every error message.

    Returns
    -------
    tuple
        The elements of ``buffers`` in their original order.

    Raises
    ------
    TypeError
        If ``buffers`` is a bare :class:`Buffer`, is not a sequence, or
        holds an element that is not a :class:`Buffer`.
    ValueError
        If ``buffers`` is an empty sequence.
    """
    cdef list out
    cdef Py_ssize_t idx
    if isinstance(buffers, Buffer):
        raise TypeError(
            f"{what}: pass a sequence of Buffers; for a single buffer use "
            f"the ManagedBuffer instance method"
        )
    if not isinstance(buffers, Sequence):
        raise TypeError(
            f"{what}: buffers must be a sequence of Buffer, "
            f"got {type(buffers).__name__}"
        )
    if not buffers:
        raise ValueError(f"{what}: empty buffers sequence")
    out = []
    for idx, item in enumerate(buffers):
        # Explicit check instead of a bare ``<Buffer?>`` cast so the error
        # carries the operation name and the offending index. This also
        # reports str/bytes input (both are Sequences) in a readable way.
        if not isinstance(item, Buffer):
            raise TypeError(
                f"{what}: buffers[{idx}] must be a Buffer, "
                f"got {type(item).__name__}"
            )
        out.append(item)
    return tuple(out)

76  

77  

cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what):
    """Return a tuple of ``n`` coerced locations.

    A non-sequence ``location`` is coerced once with ``_coerce_location``
    and repeated ``n`` times. A sequence must have exactly ``n`` entries,
    each of which is coerced individually; otherwise ``ValueError`` is
    raised with ``what`` as the message prefix. ``allow_none`` is forwarded
    to ``_coerce_location`` unchanged.
    """
    cdef Py_ssize_t given
    cdef list per_buffer
    if not isinstance(location, Sequence):
        # Single location: coerce once, then broadcast.
        return (_coerce_location(location, allow_none=allow_none),) * n
    given = len(location)
    if given != n:
        raise ValueError(
            f"{what}: location length {given} does not match "
            f"targets length {n}"
        )
    per_buffer = [
        _coerce_location(entry, allow_none=allow_none) for entry in location
    ]
    return tuple(per_buffer)

89  

90  

IF CUDA_CORE_BUILD_MAJOR >= 13:
    cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc):
        """Convert a ``_LocSpec`` dataclass to a ``cydriver.CUmemLocation``.

        ``loc.id`` is read only for the ``device`` and ``host_numa`` kinds;
        all other kinds get ``id = 0``. Any kind other than ``device``,
        ``host`` or ``host_numa`` is treated as ``host_numa_current``.
        """
        cdef cydriver.CUmemLocation cu_loc
        cdef str kind = loc.kind
        cu_loc.id = 0
        if kind == "device":
            cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
            cu_loc.id = <int>loc.id
        elif kind == "host_numa":
            cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
            cu_loc.id = <int>loc.id
        elif kind == "host":
            cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
        else:  # host_numa_current
            cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT
        return cu_loc
ELSE:
    cdef inline int _to_legacy_device(object loc) except? -2:
        """Map a location spec to a CUDA 12 device ordinal.

        CUDA 12 cuMemPrefetchAsync takes a device ordinal (-1 = host).
        Only the ``device`` and ``host`` kinds are representable; anything
        else raises ``RuntimeError``.
        """
        cdef str kind = loc.kind
        if kind == "host":
            return -1
        elif kind == "device":
            return <int>loc.id
        raise RuntimeError(
            "Host(numa_id=...) / Host.numa_current() require both cuda-bindings 13.0+ "
            "and a CUDA 13+ runtime driver; use Host() instead"
        )

121  

122  

def discard_batch(stream: Stream | GraphBuilder, buffers: Sequence[Buffer]) -> None:
    """Discard a batch of managed-memory ranges.

    Requires CUDA 13+. For a single buffer, use
    :meth:`ManagedBuffer.discard` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous discard. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to discard; a bare
        :class:`Buffer` is rejected. Resident pages are released without
        prefetching new contents; subsequent access is satisfied by lazy
        migration.

    Raises
    ------
    TypeError
        If ``buffers`` is a bare :class:`Buffer` or not a sequence of
        :class:`Buffer`.
    ValueError
        If ``buffers`` is empty or any buffer is not managed memory.
    NotImplementedError
        On a CUDA 12 build of ``cuda.core``.
    """
    cdef tuple managed = _coerce_batch_buffers(buffers, "discard_batch")
    cdef Stream target_stream = Stream_accept(stream)
    cdef Buffer item

    # Check every buffer before any driver work is issued.
    for item in managed:
        _require_managed_buffer(item, "discard_batch")

    _do_batch_discard(managed, target_stream)

152  

153  

def _do_single_discard_py(Buffer buf, stream):
    """Internal: single-buffer discard for ManagedBuffer.discard().

    Validates that ``buf`` is managed memory, resolves ``stream`` with
    ``Stream_accept`` and hands a one-element batch to ``_do_batch_discard``.
    """
    cdef Stream target_stream
    _require_managed_buffer(buf, "discard")
    target_stream = Stream_accept(stream)
    # No single-range cuMemDiscard exists; route through the batched call
    # with count=1.
    _do_batch_discard((buf,), target_stream)

162  

163  

cdef void _do_batch_discard(tuple bufs, Stream s):
    """Issue ``cuMemDiscardBatchAsync`` for every buffer in ``bufs`` on ``s``.

    On CUDA 13+ builds, collects each buffer's device pointer and size into
    parallel arrays and makes one driver call with the GIL released. On
    older builds, raises ``NotImplementedError``.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef Py_ssize_t count = len(bufs)
        cdef cydriver.CUstream cu_stream = as_cu(s._h_stream)
        cdef vector[cydriver.CUdeviceptr] dev_ptrs
        cdef vector[size_t] byte_counts
        cdef Buffer item
        cdef Py_ssize_t idx
        dev_ptrs.reserve(count)
        byte_counts.reserve(count)
        for idx in range(count):
            item = <Buffer>bufs[idx]
            dev_ptrs.push_back(as_cu(item._h_ptr))
            byte_counts.push_back(item._size)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync(
                dev_ptrs.data(), byte_counts.data(), <size_t>count, 0, cu_stream,
            ))
    ELSE:
        raise NotImplementedError(
            "discard requires a CUDA 13 build of cuda.core"
        )

186  

187  

def _advise_one(Buffer buf, advice, location):
    """Internal: apply managed-memory advice to a single buffer.

    Used by :class:`ManagedBuffer` property setters. Not part of the
    public API.

    Raises ``ValueError`` if ``buf`` is not managed memory, if ``advice``
    has no entry in ``_ADVICE_ALLOWED_LOCTYPES``, or if the coerced
    location's kind is not permitted for that advice. Raises ``TypeError``
    if ``advice`` is not a ``driver.CUmem_advise`` value. ``location`` may
    be ``None`` only for advice values in ``_ADVICE_IGNORES_LOCATION``.
    """
    cdef frozenset permitted
    cdef bint location_optional
    cdef object target

    _require_managed_buffer(buf, "advise")
    if not isinstance(advice, driver.CUmem_advise):
        raise TypeError(
            f"advice must be a cuda.bindings.driver.CUmem_advise value, "
            f"got {type(advice).__name__}"
        )

    permitted = _ADVICE_ALLOWED_LOCTYPES.get(advice)
    if permitted is None:
        raise ValueError(f"Unsupported advice value: {advice!r}")

    location_optional = advice in _ADVICE_IGNORES_LOCATION
    target = _coerce_location(location, allow_none=location_optional)
    if target is not None:
        if target.kind not in permitted:
            raise ValueError(
                f"advise {advice.name} does not support location_type='{target.kind}'"
            )
    _do_single_advise(buf, advice, target, location_optional)

210  

211  

cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none):
    """Call ``cuMemAdvise`` over the whole of ``buf``.

    ``advice_value`` is converted to the C enum through ``int()``. ``loc``
    is either an already-coerced location spec or ``None``. ``allow_none``
    is accepted but not read by this function.
    """
    cdef cydriver.CUdeviceptr dev_ptr = as_cu(buf._h_ptr)
    cdef size_t byte_count = buf._size
    cdef cydriver.CUmem_advise cu_advice = <cydriver.CUmem_advise>(<int>int(advice_value))
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef cydriver.CUmemLocation cu_loc
        if loc is not None:
            cu_loc = _to_cumemlocation(loc)
        else:
            # Driver ignores location for read_mostly / unset_preferred_location
            # advice values but still validates the CUmemLocation; pass a
            # host placeholder.
            cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
            cu_loc.id = 0
        with nogil:
            HANDLE_RETURN(cydriver.cuMemAdvise(dev_ptr, byte_count, cu_advice, cu_loc))
    ELSE:
        # -1 is the ordinal passed when no location was supplied.
        cdef int legacy_device = -1
        if loc is not None:
            legacy_device = _to_legacy_device(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemAdvise(dev_ptr, byte_count, cu_advice, legacy_device))

232  

233  

def prefetch_batch(
    stream: Stream | GraphBuilder,
    buffers: Sequence[Buffer],
    locations: Device | Host | Sequence[Device | Host],
) -> None:
    """Prefetch a batch of managed-memory ranges to target locations.

    Works on both CUDA 12 and CUDA 13 builds of ``cuda.core`` (see Notes).
    For a single buffer, use :meth:`ManagedBuffer.prefetch` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous prefetch. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to operate on; a bare
        :class:`Buffer` is rejected.
    locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...]
        Target location(s). A single location applies to all buffers; a
        sequence must match ``len(buffers)``.

    Raises
    ------
    TypeError
        If ``buffers`` is a bare :class:`Buffer` or not a sequence of
        :class:`Buffer`.
    ValueError
        If ``buffers`` is empty, if a ``locations`` sequence differs in
        length from ``buffers``, or if any buffer is not managed memory.
    RuntimeError
        On a CUDA 12 build, if a target is neither a device nor the plain
        host (NUMA host targets need CUDA 13).

    Notes
    -----
    CUDA 13 builds use ``cuMemPrefetchBatchAsync`` directly. On a CUDA 12
    build there is no batched driver entry point, so the call falls back to
    a per-buffer loop over ``cuMemPrefetchAsync``.
    """
    cdef tuple bufs = _coerce_batch_buffers(buffers, "prefetch_batch")
    cdef Py_ssize_t n = len(bufs)
    cdef tuple locs = _broadcast_locations(locations, n, False, "prefetch_batch")
    cdef Stream s = Stream_accept(stream)
    cdef Buffer buf

    # Check every buffer before any driver work is issued.
    for buf in bufs:
        _require_managed_buffer(buf, "prefetch_batch")

    _do_batch_prefetch(bufs, locs, s)

271  

272  

def _do_single_prefetch_py(Buffer buf, location, stream):
    """Internal: single-buffer prefetch for ManagedBuffer.prefetch().

    Uses cuMemPrefetchAsync (works on CUDA 12 and 13). ``location`` is
    required; it is coerced with ``allow_none=False``.
    """
    cdef object target
    cdef Stream target_stream
    _require_managed_buffer(buf, "prefetch")
    target = _coerce_location(location, allow_none=False)
    target_stream = Stream_accept(stream)
    _do_single_prefetch(buf, target, target_stream)

282  

283  

cdef void _do_single_prefetch(Buffer buf, object loc, Stream s):
    """Issue one ``cuMemPrefetchAsync`` covering all of ``buf`` on stream ``s``.

    CUDA 13+ builds pass a ``CUmemLocation`` built from ``loc``; older
    builds pass the legacy device ordinal derived from ``loc``.
    """
    cdef cydriver.CUdeviceptr dev_ptr = as_cu(buf._h_ptr)
    cdef size_t byte_count = buf._size
    cdef cydriver.CUstream cu_stream = as_cu(s._h_stream)
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef cydriver.CUmemLocation dest = _to_cumemlocation(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(dev_ptr, byte_count, dest, 0, cu_stream))
    ELSE:
        cdef int legacy_device = _to_legacy_device(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(dev_ptr, byte_count, legacy_device, cu_stream))

296  

297  

IF CUDA_CORE_BUILD_MAJOR >= 13:
    # Function-pointer type for cuMemPrefetchBatchAsync /
    # cuMemDiscardAndPrefetchBatchAsync; both have identical signatures.
    ctypedef cydriver.CUresult (*_BatchPrefetchFn)(
        cydriver.CUdeviceptr*, size_t*, size_t,
        cydriver.CUmemLocation*, size_t*, size_t,
        unsigned long long, cydriver.CUstream,
    ) except ?cydriver.CUDA_ERROR_NOT_FOUND nogil


    def _read_preferred_location_v2(Buffer buf):
        """Internal: read preferred_location with full NUMA detail.

        Bypasses cuda.bindings.driver.cuMemRangeGetAttribute (whose
        attribute allowlist doesn't yet include the cu13 _TYPE / _ID
        attributes) by calling cydriver directly.

        Returns Device | Host | None. ``None`` is returned when the
        reported location type matches none of the device / host variants.
        """
        cdef cydriver.CUdeviceptr dev_ptr = as_cu(buf._h_ptr)
        cdef size_t byte_count = buf._size
        cdef int kind_code = 0
        cdef int ordinal = 0
        # Two queries over the same range: first the location type, then its id.
        with nogil:
            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
                <void*>&kind_code, sizeof(int),
                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE,
                dev_ptr, byte_count,
            ))
            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
                <void*>&ordinal, sizeof(int),
                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID,
                dev_ptr, byte_count,
            ))
        if kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE:
            from cuda.core._device import Device
            return Device(ordinal)
        elif kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST:
            return Host()
        elif kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA:
            return Host(numa_id=ordinal)
        elif kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT:
            return Host.numa_current()
        else:
            # CU_MEM_LOCATION_TYPE_INVALID — no preferred location
            return None


    cdef void _do_batch_prefetch_op(tuple bufs, tuple locs, Stream s, _BatchPrefetchFn fn):
        """Shared body for batched prefetch / discard-and-prefetch.

        Builds parallel arrays of pointers, sizes and locations (one
        location per buffer, with location index ``i`` for buffer ``i``)
        and invokes ``fn`` once with the GIL released.
        """
        cdef Py_ssize_t count = len(bufs)
        cdef cydriver.CUstream cu_stream = as_cu(s._h_stream)
        cdef vector[cydriver.CUdeviceptr] dev_ptrs
        cdef vector[size_t] byte_counts
        cdef vector[cydriver.CUmemLocation] dests
        cdef vector[size_t] dest_indices
        cdef Buffer item
        cdef Py_ssize_t idx
        dev_ptrs.reserve(count)
        byte_counts.reserve(count)
        dests.reserve(count)
        dest_indices.reserve(count)
        for idx in range(count):
            item = <Buffer>bufs[idx]
            dev_ptrs.push_back(as_cu(item._h_ptr))
            byte_counts.push_back(item._size)
            dests.push_back(_to_cumemlocation(locs[idx]))
            dest_indices.push_back(<size_t>idx)
        with nogil:
            HANDLE_RETURN(fn(
                dev_ptrs.data(), byte_counts.data(), <size_t>count,
                dests.data(), dest_indices.data(), <size_t>count,
                0, cu_stream,
            ))
ELSE:
    def _read_preferred_location_v2(Buffer buf):
        # Symbol exists so _managed_buffer.py can `from ... import
        # _read_preferred_location_v2` unconditionally at module top.
        # `ManagedBuffer.preferred_location` gates on both
        # binding_version() and driver_version() >= (13, 0, 0) before
        # calling, so this path is unreachable on a cu12 build.
        raise NotImplementedError(
            "_read_preferred_location_v2 requires a CUDA 13 build of cuda.core"
        )

380  

381  

cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s):
    """Prefetch each ``bufs[i]`` to ``locs[i]`` on stream ``s``.

    CUDA 13+ builds make one ``cuMemPrefetchBatchAsync`` call; older builds
    call ``_do_single_prefetch`` once per buffer.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        _do_batch_prefetch_op(bufs, locs, s, cydriver.cuMemPrefetchBatchAsync)
    ELSE:
        # cu12 has no cuMemPrefetchBatchAsync; loop per-range.
        cdef Py_ssize_t idx
        cdef Py_ssize_t count = len(bufs)
        for idx in range(count):
            _do_single_prefetch(<Buffer>bufs[idx], locs[idx], s)

393  

394  

def discard_prefetch_batch(
    stream: Stream | GraphBuilder,
    buffers: Sequence[Buffer],
    locations: Device | Host | Sequence[Device | Host],
) -> None:
    """Discard a batch of managed-memory ranges and prefetch them to target locations.

    Requires CUDA 13+. For a single buffer, use
    :meth:`ManagedBuffer.discard_prefetch` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous operation. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to discard and
        re-prefetch; a bare :class:`Buffer` is rejected.
    locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...]
        Target location(s). A single location applies to all buffers;
        a sequence must match ``len(buffers)``.

    Raises
    ------
    TypeError
        If ``buffers`` is a bare :class:`Buffer` or not a sequence of
        :class:`Buffer`.
    ValueError
        If ``buffers`` is empty, if a ``locations`` sequence differs in
        length from ``buffers``, or if any buffer is not managed memory.
    NotImplementedError
        On a CUDA 12 build of ``cuda.core``.
    """
    cdef tuple managed = _coerce_batch_buffers(buffers, "discard_prefetch_batch")
    cdef Py_ssize_t count = len(managed)
    cdef tuple targets = _broadcast_locations(
        locations, count, False, "discard_prefetch_batch"
    )
    cdef Stream target_stream = Stream_accept(stream)
    cdef Buffer item

    # Check every buffer before any driver work is issued.
    for item in managed:
        _require_managed_buffer(item, "discard_prefetch_batch")

    _do_batch_discard_prefetch(managed, targets, target_stream)

431  

432  

def _do_single_discard_prefetch_py(Buffer buf, location, stream):
    """Internal: single-buffer discard+prefetch for
    ManagedBuffer.discard_prefetch().

    Validates ``buf``, coerces the required ``location``, resolves
    ``stream`` and forwards one-element batches to
    ``_do_batch_discard_prefetch``.
    """
    cdef object target
    cdef Stream target_stream
    _require_managed_buffer(buf, "discard_prefetch")
    target = _coerce_location(location, allow_none=False)
    target_stream = Stream_accept(stream)
    _do_batch_discard_prefetch((buf,), (target,), target_stream)

442  

443  

cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s):
    """Discard and re-prefetch each ``bufs[i]`` to ``locs[i]`` on stream ``s``.

    CUDA 13+ builds route through ``_do_batch_prefetch_op`` with
    ``cuMemDiscardAndPrefetchBatchAsync``; older builds raise
    ``NotImplementedError``.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        _do_batch_prefetch_op(
            bufs, locs, s,
            cydriver.cuMemDiscardAndPrefetchBatchAsync,
        )
    ELSE:
        raise NotImplementedError("discard_prefetch requires a CUDA 13 build of cuda.core")