Coverage for cuda/core/_memory/_managed_memory_ops.pyx: 92.22%
180 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-13 01:38 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-13 01:38 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
5from __future__ import annotations
7from collections.abc import Sequence
8from typing import TYPE_CHECKING
10IF CUDA_CORE_BUILD_MAJOR >= 13:
11 from libcpp.vector cimport vector
13from cuda.bindings cimport cydriver
14from cuda.core._memory._buffer cimport Buffer
15from cuda.core._resource_handles cimport as_cu
16from cuda.core._stream cimport Stream, Stream_accept
17from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
19from cuda.core._host import Host
20from cuda.core._utils.cuda_utils import driver
21from cuda.core._memory._managed_location import _coerce_location
23if TYPE_CHECKING:
24 from cuda.core._graph import GraphBuilder
25 from cuda.core._device import Device
# Named groups of location kinds, shared by the advice table below.
cdef frozenset _ALL_LOCATION_TYPES = frozenset({"device", "host", "host_numa", "host_numa_current"})
cdef frozenset _DEVICE_HOST_NUMA = frozenset({"device", "host", "host_numa"})
cdef frozenset _DEVICE_HOST_ONLY = frozenset({"device", "host"})

# Advice values for which a ``None`` location is accepted by _advise_one.
cdef set _ADVICE_IGNORES_LOCATION = set((
    driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_READ_MOSTLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
))

# Maps each supported advice value to the location kinds it may be paired
# with; an advice value missing from this table is rejected as unsupported.
cdef dict _ADVICE_ALLOWED_LOCTYPES = {
    # read-mostly
    driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY: _DEVICE_HOST_NUMA,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_READ_MOSTLY: _DEVICE_HOST_NUMA,
    # preferred location
    driver.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION: _ALL_LOCATION_TYPES,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: _DEVICE_HOST_NUMA,
    # accessed-by
    driver.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY: _DEVICE_HOST_ONLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_ACCESSED_BY: _DEVICE_HOST_ONLY,
}
cdef void _require_managed_buffer(Buffer self, str what):
    """Raise ``ValueError`` unless ``self`` is a managed-memory allocation.

    ``what`` is the name of the calling operation; it becomes the prefix of
    the error message.
    """
    # Buffer.is_managed covers both the pointer-attribute path and the
    # memory-resource path (e.g. pool-allocated managed memory whose pointer
    # attribute does not advertise CU_POINTER_ATTRIBUTE_IS_MANAGED).
    if self.is_managed:
        return
    raise ValueError(f"{what} requires a managed-memory allocation")
cdef tuple _coerce_batch_buffers(object buffers, str what):
    """Coerce ``buffers`` to a ``tuple[Buffer, ...]``.

    A bare :class:`Buffer` is rejected: single-buffer operations go through
    the corresponding ManagedBuffer instance method instead.

    Parameters
    ----------
    buffers : object
        Expected to be a non-empty ``Sequence`` whose items are all
        :class:`Buffer` instances.
    what : str
        Name of the calling operation, used as the prefix of every error
        message raised here.

    Returns
    -------
    tuple
        The items of ``buffers``, in order.

    Raises
    ------
    TypeError
        If ``buffers`` is a single Buffer, is not a ``Sequence``, or holds
        an item that is not a Buffer.
    ValueError
        If ``buffers`` is an empty sequence.
    """
    cdef list out
    if isinstance(buffers, Buffer):
        raise TypeError(
            f"{what}: pass a sequence of Buffers; for a single buffer use "
            f"the ManagedBuffer instance method"
        )
    if not isinstance(buffers, Sequence):
        raise TypeError(
            f"{what}: buffers must be a sequence of Buffer, "
            f"got {type(buffers).__name__}"
        )
    if not buffers:
        raise ValueError(f"{what}: empty buffers sequence")
    out = []
    for idx, item in enumerate(buffers):
        # Explicit check rather than a bare checked cast so that the error
        # names the operation and the offending position, like every other
        # error raised by this helper.
        if not isinstance(item, Buffer):
            raise TypeError(
                f"{what}: buffers[{idx}] must be a Buffer, "
                f"got {type(item).__name__}"
            )
        out.append(item)
    return tuple(out)
cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what):
    """Return a tuple of ``n`` coerced locations.

    A non-sequence ``location`` is coerced once and repeated ``n`` times. A
    ``Sequence`` must have exactly ``n`` items, each coerced individually;
    otherwise ``ValueError`` is raised with ``what`` as the message prefix.
    ``allow_none`` is forwarded to ``_coerce_location``.
    """
    cdef object single
    if not isinstance(location, Sequence):
        single = _coerce_location(location, allow_none=allow_none)
        return (single,) * n
    if len(location) != n:
        raise ValueError(
            f"{what}: location length {len(location)} does not match "
            f"targets length {n}"
        )
    return tuple([_coerce_location(item, allow_none=allow_none) for item in location])
IF CUDA_CORE_BUILD_MAJOR >= 13:
    cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc):
        """Convert a location spec (``.kind`` / ``.id``) to a ``CUmemLocation``.

        ``loc.id`` is read only for the "device" and "host_numa" kinds; the
        other kinds get an id of 0. Any kind other than "device", "host" or
        "host_numa" is treated as "host_numa_current".
        """
        cdef cydriver.CUmemLocation result
        cdef str kind = loc.kind
        if kind == "device":
            result.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
            result.id = <int>loc.id
            return result
        if kind == "host_numa":
            result.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
            result.id = <int>loc.id
            return result
        # The remaining kinds do not carry an id.
        if kind == "host":
            result.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
        else:  # host_numa_current
            result.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT
        result.id = 0
        return result
ELSE:
    cdef inline int _to_legacy_device(object loc) except? -2:
        """Convert a location spec to a CUDA 12 device ordinal.

        CUDA 12 cuMemPrefetchAsync takes a device ordinal, with -1 meaning
        the host. Kinds other than "device" and "host" raise ``RuntimeError``.
        """
        cdef str kind = loc.kind
        if kind == "host":
            return -1
        if kind != "device":
            raise RuntimeError(
                "Host(numa_id=...) / Host.numa_current() require both cuda-bindings 13.0+ "
                "and a CUDA 13+ runtime driver; use Host() instead"
            )
        return <int>loc.id
def discard_batch(stream: Stream | GraphBuilder, buffers: Sequence[Buffer]) -> None:
    """Discard a batch of managed-memory ranges.

    Requires CUDA 13+. For a single buffer, use
    :meth:`ManagedBuffer.discard` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous discard. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to discard; a bare
        :class:`Buffer` is rejected. Resident pages are released without
        prefetching new contents; subsequent access is satisfied by lazy
        migration.

    Raises
    ------
    TypeError
        If ``buffers`` is a single :class:`Buffer` or not a sequence of
        Buffers.
    ValueError
        If ``buffers`` is empty or any buffer is not a managed-memory
        allocation.
    NotImplementedError
        On a CUDA 12 build of ``cuda.core``.
    """
    cdef Buffer managed
    cdef tuple batch = _coerce_batch_buffers(buffers, "discard_batch")
    cdef Stream target_stream = Stream_accept(stream)

    # Validate every buffer before anything is submitted to the driver.
    for managed in batch:
        _require_managed_buffer(managed, "discard_batch")

    _do_batch_discard(batch, target_stream)
def _do_single_discard_py(Buffer buf, stream: Stream | GraphBuilder | None) -> None:
    """Internal: discard one managed buffer on behalf of ManagedBuffer.discard().

    Raises ``ValueError`` if ``buf`` is not a managed-memory allocation.
    """
    _require_managed_buffer(buf, "discard")
    cdef Stream target_stream = Stream_accept(stream)
    # There is no single-range cuMemDiscard, so the buffer is submitted
    # through the batched entry point as a batch of one.
    _do_batch_discard((buf,), target_stream)
cdef void _do_batch_discard(tuple bufs, Stream s):
    """Submit ``cuMemDiscardBatchAsync`` for every buffer in ``bufs`` on ``s``.

    ``bufs`` must hold only Buffer instances; callers validate this. On a
    CUDA 12 build this raises ``NotImplementedError``.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef Py_ssize_t count = len(bufs)
        cdef Py_ssize_t idx
        cdef Buffer current
        cdef vector[cydriver.CUdeviceptr] dev_ptrs
        cdef vector[size_t] byte_counts
        cdef cydriver.CUstream cu_stream = as_cu(s._h_stream)

        # Flatten the buffers into the parallel pointer / size arrays that
        # the driver call takes.
        dev_ptrs.resize(count)
        byte_counts.resize(count)
        for idx in range(count):
            current = <Buffer>bufs[idx]
            dev_ptrs[idx] = as_cu(current._h_ptr)
            byte_counts[idx] = current._size

        with nogil:
            HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync(
                dev_ptrs.data(), byte_counts.data(), <size_t>count, 0, cu_stream,
            ))
    ELSE:
        raise NotImplementedError(
            "discard requires a CUDA 13 build of cuda.core"
        )
def _advise_one(Buffer buf, advice: driver.CUmem_advise, location: Device | Host | None) -> None:
    """Internal: apply one managed-memory advice value to a single buffer.

    Used by :class:`ManagedBuffer` property setters. Not part of the
    public API.

    Raises
    ------
    ValueError
        If ``buf`` is not a managed allocation, ``advice`` is not a
        supported advice value, or the location kind is not allowed for
        ``advice``.
    TypeError
        If ``advice`` is not a ``driver.CUmem_advise`` member.
    """
    _require_managed_buffer(buf, "advise")
    if not isinstance(advice, driver.CUmem_advise):
        raise TypeError(
            f"advice must be a cuda.bindings.driver.CUmem_advise value, "
            f"got {type(advice).__name__}"
        )

    cdef frozenset permitted_kinds = _ADVICE_ALLOWED_LOCTYPES.get(advice)
    if permitted_kinds is None:
        raise ValueError(f"Unsupported advice value: {advice!r}")

    # A ``None`` location is accepted only for advice values listed in
    # _ADVICE_IGNORES_LOCATION.
    cdef bint location_optional = advice in _ADVICE_IGNORES_LOCATION
    cdef object target = _coerce_location(location, allow_none=location_optional)
    if not (target is None or target.kind in permitted_kinds):
        raise ValueError(
            f"advise {advice.name} does not support location_type='{target.kind}'"
        )
    _do_single_advise(buf, advice, target, location_optional)
cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none):
    """Call ``cuMemAdvise`` for the whole of ``buf``.

    ``loc`` is an already-coerced location spec or ``None``. ``allow_none``
    is accepted for signature compatibility and is not read here.
    """
    cdef cydriver.CUdeviceptr dev_ptr = as_cu(buf._h_ptr)
    cdef size_t byte_count = buf._size
    cdef cydriver.CUmem_advise cu_advice = <cydriver.CUmem_advise>(<int>int(advice_value))
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef cydriver.CUmemLocation cu_target
        if loc is not None:
            cu_target = _to_cumemlocation(loc)
        else:
            # The driver ignores the location for read_mostly /
            # unset_preferred_location advice values but still validates
            # the CUmemLocation, so a host placeholder is passed.
            cu_target.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
            cu_target.id = 0
        with nogil:
            HANDLE_RETURN(cydriver.cuMemAdvise(dev_ptr, byte_count, cu_advice, cu_target))
    ELSE:
        # A missing location is passed to the legacy API as ordinal -1.
        cdef int legacy_device = -1
        if loc is not None:
            legacy_device = _to_legacy_device(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemAdvise(dev_ptr, byte_count, cu_advice, legacy_device))
def prefetch_batch(
    stream: Stream | GraphBuilder,
    buffers: Sequence[Buffer],
    locations: Device | Host | Sequence[Device | Host],
) -> None:
    """Prefetch a batch of managed-memory ranges to target locations.

    Available on both CUDA 12 and CUDA 13 builds of ``cuda.core`` (see
    Notes). For a single buffer, use :meth:`ManagedBuffer.prefetch`
    instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous prefetch. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to operate on. A bare
        :class:`Buffer` is rejected.
    locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...]
        Target location(s). A single location applies to all buffers; a
        sequence must match ``len(buffers)``.

    Raises
    ------
    TypeError
        If ``buffers`` is a single :class:`Buffer` or not a sequence of
        Buffers.
    ValueError
        If ``buffers`` is empty, a ``locations`` sequence does not match
        ``len(buffers)``, or any buffer is not a managed-memory
        allocation.
    RuntimeError
        On a CUDA 12 build, if a target location is neither a device nor
        the plain host.

    Notes
    -----
    On a CUDA 12 build, falls back to a Python-level loop calling
    ``cuMemPrefetchAsync`` per buffer (no batched driver entry point on
    CUDA 12). CUDA 13 builds use ``cuMemPrefetchBatchAsync`` directly.
    """
    cdef Buffer managed
    cdef tuple batch = _coerce_batch_buffers(buffers, "prefetch_batch")
    cdef Py_ssize_t count = len(batch)
    cdef tuple targets = _broadcast_locations(locations, count, False, "prefetch_batch")
    cdef Stream target_stream = Stream_accept(stream)

    # Validate every buffer before anything is submitted to the driver.
    for managed in batch:
        _require_managed_buffer(managed, "prefetch_batch")

    _do_batch_prefetch(batch, targets, target_stream)
def _do_single_prefetch_py(Buffer buf, location: Device | Host | None, stream: Stream | GraphBuilder | None) -> None:
    """Internal: prefetch one managed buffer for ManagedBuffer.prefetch().

    Goes through cuMemPrefetchAsync, which is available on CUDA 12 and 13.
    ``location`` is coerced with ``allow_none=False``. Raises ``ValueError``
    if ``buf`` is not a managed-memory allocation.
    """
    _require_managed_buffer(buf, "prefetch")
    cdef object target = _coerce_location(location, allow_none=False)
    cdef Stream target_stream = Stream_accept(stream)
    _do_single_prefetch(buf, target, target_stream)
cdef void _do_single_prefetch(Buffer buf, object loc, Stream s):
    """Call ``cuMemPrefetchAsync`` for the whole of ``buf`` on stream ``s``.

    ``loc`` is an already-coerced location spec. CUDA 13 builds pass it as
    a ``CUmemLocation``; CUDA 12 builds pass a legacy device ordinal.
    """
    cdef cydriver.CUdeviceptr dev_ptr = as_cu(buf._h_ptr)
    cdef size_t byte_count = buf._size
    cdef cydriver.CUstream cu_stream = as_cu(s._h_stream)
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef cydriver.CUmemLocation cu_target = _to_cumemlocation(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(dev_ptr, byte_count, cu_target, 0, cu_stream))
    ELSE:
        cdef int legacy_device = _to_legacy_device(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(dev_ptr, byte_count, legacy_device, cu_stream))
IF CUDA_CORE_BUILD_MAJOR >= 13:
    # cuMemPrefetchBatchAsync and cuMemDiscardAndPrefetchBatchAsync have
    # identical signatures; this is the function-pointer type for both.
    ctypedef cydriver.CUresult (*_BatchPrefetchFn)(
        cydriver.CUdeviceptr*, size_t*, size_t,
        cydriver.CUmemLocation*, size_t*, size_t,
        unsigned long long, cydriver.CUstream,
    ) except ?cydriver.CUDA_ERROR_NOT_FOUND nogil


    def _read_preferred_location_v2(Buffer buf) -> Device | Host | None:
        """Internal: read the preferred location of ``buf`` with NUMA detail.

        Calls cydriver directly rather than
        cuda.bindings.driver.cuMemRangeGetAttribute, whose attribute
        allowlist doesn't yet include the cu13 _TYPE / _ID attributes.

        Returns a Device, a Host, or ``None`` when the reported location
        type is none of the device / host variants.
        """
        cdef cydriver.CUdeviceptr dev_ptr = as_cu(buf._h_ptr)
        cdef size_t byte_count = buf._size
        cdef int kind_code = 0
        cdef int kind_id = 0
        with nogil:
            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
                <void*>&kind_code, sizeof(int),
                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE,
                dev_ptr, byte_count,
            ))
            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
                <void*>&kind_id, sizeof(int),
                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID,
                dev_ptr, byte_count,
            ))
        if kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE:
            # Imported here rather than at module top, where Device is only
            # imported under TYPE_CHECKING.
            from cuda.core._device import Device
            return Device(kind_id)
        elif kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST:
            return Host()
        elif kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA:
            return Host(numa_id=kind_id)
        elif kind_code == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT:
            return Host.numa_current()
        else:
            # CU_MEM_LOCATION_TYPE_INVALID — no preferred location
            return None


    cdef void _do_batch_prefetch_op(tuple bufs, tuple locs, Stream s, _BatchPrefetchFn fn):
        """Shared body for batched prefetch / discard-and-prefetch.

        ``bufs`` and ``locs`` are parallel tuples of equal length; ``fn`` is
        the batched driver entry point to invoke on stream ``s``.
        """
        cdef Py_ssize_t count = len(bufs)
        cdef Py_ssize_t idx
        cdef Buffer current
        cdef vector[cydriver.CUdeviceptr] dev_ptrs
        cdef vector[size_t] byte_counts
        cdef vector[cydriver.CUmemLocation] cu_targets
        cdef vector[size_t] target_indices
        cdef cydriver.CUstream cu_stream = as_cu(s._h_stream)

        dev_ptrs.resize(count)
        byte_counts.resize(count)
        cu_targets.resize(count)
        target_indices.resize(count)
        for idx in range(count):
            current = <Buffer>bufs[idx]
            dev_ptrs[idx] = as_cu(current._h_ptr)
            byte_counts[idx] = current._size
            # One location entry per buffer, so entry idx is indexed to
            # buffer idx.
            cu_targets[idx] = _to_cumemlocation(locs[idx])
            target_indices[idx] = <size_t>idx

        with nogil:
            HANDLE_RETURN(fn(
                dev_ptrs.data(), byte_counts.data(), <size_t>count,
                cu_targets.data(), target_indices.data(), <size_t>count,
                0, cu_stream,
            ))
ELSE:
    def _read_preferred_location_v2(Buffer buf) -> Device | Host | None:
        # This stub keeps the symbol importable on every build, so
        # _managed_buffer.py can `from ... import
        # _read_preferred_location_v2` unconditionally at module top.
        # `ManagedBuffer.preferred_location` gates on both
        # binding_version() and driver_version() >= (13, 0, 0) before
        # calling, so this path is unreachable on a cu12 build.
        raise NotImplementedError(
            "_read_preferred_location_v2 requires a CUDA 13 build of cuda.core"
        )
cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s):
    """Prefetch each buffer in ``bufs`` to the matching entry of ``locs``.

    CUDA 13 builds issue one ``cuMemPrefetchBatchAsync`` call; CUDA 12
    builds issue one single-range prefetch per buffer.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        _do_batch_prefetch_op(bufs, locs, s, cydriver.cuMemPrefetchBatchAsync)
    ELSE:
        # cu12 has no cuMemPrefetchBatchAsync; loop per-range.
        cdef Py_ssize_t count = len(bufs)
        cdef Py_ssize_t idx
        for idx in range(count):
            _do_single_prefetch(<Buffer>bufs[idx], locs[idx], s)
def discard_prefetch_batch(
    stream: Stream | GraphBuilder,
    buffers: Sequence[Buffer],
    locations: Device | Host | Sequence[Device | Host],
) -> None:
    """Discard a batch of managed-memory ranges and prefetch them to target locations.

    Requires CUDA 13+. For a single buffer, use
    :meth:`ManagedBuffer.discard_prefetch` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous operation. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to discard and
        re-prefetch; a bare :class:`Buffer` is rejected.
    locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...]
        Target location(s). A single location applies to all buffers;
        a sequence must match ``len(buffers)``.

    Raises
    ------
    TypeError
        If ``buffers`` is a single :class:`Buffer` or not a sequence of
        Buffers.
    ValueError
        If ``buffers`` is empty, a ``locations`` sequence does not match
        ``len(buffers)``, or any buffer is not a managed-memory
        allocation.
    NotImplementedError
        On a CUDA 12 build of ``cuda.core``.
    """
    cdef Buffer managed
    cdef tuple batch = _coerce_batch_buffers(buffers, "discard_prefetch_batch")
    cdef Py_ssize_t count = len(batch)
    cdef tuple targets = _broadcast_locations(locations, count, False, "discard_prefetch_batch")
    cdef Stream target_stream = Stream_accept(stream)

    # Validate every buffer before anything is submitted to the driver.
    for managed in batch:
        _require_managed_buffer(managed, "discard_prefetch_batch")

    _do_batch_discard_prefetch(batch, targets, target_stream)
def _do_single_discard_prefetch_py(Buffer buf, location: Device | Host | None, stream: Stream | GraphBuilder | None) -> None:
    """Internal: discard and prefetch one managed buffer for
    ManagedBuffer.discard_prefetch().

    ``location`` is coerced with ``allow_none=False``. Raises ``ValueError``
    if ``buf`` is not a managed-memory allocation.
    """
    _require_managed_buffer(buf, "discard_prefetch")
    cdef object target = _coerce_location(location, allow_none=False)
    cdef Stream target_stream = Stream_accept(stream)
    # Submitted through the batched helper as a batch of one.
    _do_batch_discard_prefetch((buf,), (target,), target_stream)
cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s):
    """Discard each buffer in ``bufs`` and prefetch it to the matching ``locs`` entry.

    CUDA 13 builds issue ``cuMemDiscardAndPrefetchBatchAsync`` through the
    shared batched helper. CUDA 12 builds raise ``NotImplementedError``.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        _do_batch_prefetch_op(
            bufs, locs, s, cydriver.cuMemDiscardAndPrefetchBatchAsync,
        )
    ELSE:
        raise NotImplementedError(
            "discard_prefetch requires a CUDA 13 build of cuda.core"
        )