Coverage for cuda / core / _memory / _managed_memory_ops.pyx: 93.22%
177 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-22 01:37 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-22 01:37 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
5from __future__ import annotations
7from collections.abc import Sequence
9IF CUDA_CORE_BUILD_MAJOR >= 13:
10 from libcpp.vector cimport vector
12from cuda.bindings cimport cydriver
13from cuda.core._memory._buffer cimport Buffer
14from cuda.core._resource_handles cimport as_cu
15from cuda.core._stream cimport Stream, Stream_accept
16from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
18from cuda.core._host import Host
19from cuda.core._utils.cuda_utils import driver
20from cuda.core._memory._managed_location import _coerce_location
# Named groups of location kinds, shared by the advice table below.
cdef frozenset _ALL_LOCATION_TYPES = frozenset({"device", "host", "host_numa", "host_numa_current"})
cdef frozenset _DEVICE_HOST_NUMA = frozenset({"device", "host", "host_numa"})
cdef frozenset _DEVICE_HOST_ONLY = frozenset({"device", "host"})

# Advice values for which ``_advise_one`` accepts ``location=None``.
cdef set _ADVICE_IGNORES_LOCATION = {
    driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_READ_MOSTLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
}

# Per-advice set of location kinds that ``_advise_one`` will accept. An advice
# value that is missing from this table is rejected as unsupported.
cdef dict _ADVICE_ALLOWED_LOCTYPES = {
    driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY: _DEVICE_HOST_NUMA,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_READ_MOSTLY: _DEVICE_HOST_NUMA,
    driver.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION: _ALL_LOCATION_TYPES,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: _DEVICE_HOST_NUMA,
    driver.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY: _DEVICE_HOST_ONLY,
    driver.CUmem_advise.CU_MEM_ADVISE_UNSET_ACCESSED_BY: _DEVICE_HOST_ONLY,
}
cdef void _require_managed_buffer(Buffer self, str what):
    """Raise ``ValueError`` unless ``self`` is a managed-memory buffer.

    ``what`` names the calling operation and is used as the prefix of the
    error message.
    """
    # Buffer.is_managed handles both pointer-attribute and memory-resource
    # paths (e.g. pool-allocated managed memory whose pointer attribute
    # does not advertise CU_POINTER_ATTRIBUTE_IS_MANAGED).
    if self.is_managed:
        return
    raise ValueError(f"{what} requires a managed-memory allocation")
cdef tuple _coerce_batch_buffers(object buffers, str what):
    """Coerce ``buffers`` to a tuple[Buffer, ...]; rejects a single Buffer.

    For single-buffer operations, use the corresponding ManagedBuffer
    instance method instead.

    Parameters
    ----------
    buffers : object
        Candidate batch. Must be a non-empty ``Sequence`` whose elements
        are all ``Buffer`` instances.
    what : str
        Name of the calling operation; used as the error-message prefix.

    Returns
    -------
    tuple
        The elements of ``buffers``, in order.

    Raises
    ------
    TypeError
        If ``buffers`` is a bare ``Buffer``, is not a ``Sequence``, or
        contains an element that is ``None`` or not a ``Buffer``.
    ValueError
        If ``buffers`` is an empty sequence.
    """
    cdef Buffer buf
    cdef list out
    if isinstance(buffers, Buffer):
        raise TypeError(
            f"{what}: pass a sequence of Buffers; for a single buffer use "
            f"the ManagedBuffer instance method"
        )
    if not isinstance(buffers, Sequence):
        raise TypeError(
            f"{what}: buffers must be a sequence of Buffer, "
            f"got {type(buffers).__name__}"
        )
    if not buffers:
        raise ValueError(f"{what}: empty buffers sequence")
    out = []
    for t in buffers:
        # The checked cast below lets None through, so reject it explicitly
        # here instead of failing later with an unrelated attribute error.
        if t is None:
            raise TypeError(
                f"{what}: buffers must be a sequence of Buffer, "
                f"got a None element"
            )
        buf = <Buffer?>t
        out.append(buf)
    return tuple(out)
cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what):
    """Return a tuple of ``n`` coerced locations.

    A non-sequence ``location`` is coerced once and repeated ``n`` times.
    A ``Sequence`` must have exactly ``n`` entries, each of which is coerced
    individually; a length mismatch raises ``ValueError`` prefixed with
    ``what``. ``allow_none`` is forwarded to ``_coerce_location``.
    """
    cdef object single
    if not isinstance(location, Sequence):
        single = _coerce_location(location, allow_none=allow_none)
        return (single,) * n
    if len(location) != n:
        raise ValueError(
            f"{what}: location length {len(location)} does not match "
            f"targets length {n}"
        )
    return tuple([_coerce_location(entry, allow_none=allow_none) for entry in location])
IF CUDA_CORE_BUILD_MAJOR >= 13:
    cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc):
        """Convert a _LocSpec dataclass to a cydriver.CUmemLocation struct.

        ``loc.kind`` selects the location type. ``loc.id`` is read only for
        the "device" and "host_numa" kinds; the other kinds use id 0.

        Raises
        ------
        ValueError
            If ``loc.kind`` is not one of "device", "host", "host_numa" or
            "host_numa_current".
        """
        cdef cydriver.CUmemLocation out
        cdef str kind = loc.kind
        if kind == "device":
            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
            out.id = <int>loc.id
        elif kind == "host":
            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
            out.id = 0
        elif kind == "host_numa":
            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
            out.id = <int>loc.id
        elif kind == "host_numa_current":
            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT
            out.id = 0
        else:
            # Fail loudly rather than silently treating an unrecognised kind
            # as HOST_NUMA_CURRENT.
            raise ValueError(f"unsupported location kind: {kind!r}")
        return out
ELSE:
    cdef inline int _to_legacy_device(object loc) except? -2:
        """Convert a location spec to a CUDA 12 device ordinal.

        CUDA 12 cuMemPrefetchAsync takes a device ordinal (-1 = host).
        Returns ``loc.id`` for the "device" kind and -1 for the "host" kind.

        Raises
        ------
        RuntimeError
            For any other kind (the NUMA host variants are not expressible
            as a device ordinal).
        """
        cdef str kind = loc.kind
        if kind == "device":
            return <int>loc.id
        if kind == "host":
            return -1
        raise RuntimeError(
            "Host(numa_id=...) / Host.numa_current() require both cuda-bindings 13.0+ "
            "and a CUDA 13+ runtime driver; use Host() instead"
        )
def discard_batch(stream: Stream | GraphBuilder, buffers: Sequence[Buffer]) -> None:
    """Discard a batch of managed-memory ranges.

    For a single buffer, use :meth:`ManagedBuffer.discard` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream on which the asynchronous discard is enqueued. First
        positional, required (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to discard. Passing a
        bare :class:`Buffer` is rejected.

    Raises
    ------
    TypeError
        If ``buffers`` is a bare :class:`Buffer` or not a sequence.
    ValueError
        If ``buffers`` is empty or any element is not managed memory.
    NotImplementedError
        On a CUDA 12 build of ``cuda.core``.
    """
    cdef Buffer member
    cdef tuple batch = _coerce_batch_buffers(buffers, "discard_batch")
    cdef Stream target = Stream_accept(stream)

    # Validate every buffer before anything is submitted to the driver.
    for member in batch:
        _require_managed_buffer(member, "discard_batch")

    _do_batch_discard(batch, target)
def _do_single_discard_py(Buffer buf, stream):
    """Internal: discard one managed buffer on behalf of ManagedBuffer.discard().

    ``buf`` must be managed memory, otherwise ``ValueError`` is raised.
    ``stream`` is passed through ``Stream_accept``.
    """
    _require_managed_buffer(buf, "discard")
    cdef Stream target = Stream_accept(stream)
    # No single-range cuMemDiscard exists; route through the batched call
    # with a one-element batch.
    _do_batch_discard((buf,), target)
cdef void _do_batch_discard(tuple bufs, Stream s):
    """Submit ``cuMemDiscardBatchAsync`` for every buffer in ``bufs`` on ``s``.

    On a build where ``CUDA_CORE_BUILD_MAJOR`` is below 13 this raises
    ``NotImplementedError`` instead.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef Py_ssize_t count = len(bufs)
        cdef cydriver.CUstream raw_stream = as_cu(s._h_stream)
        cdef vector[cydriver.CUdeviceptr] addresses
        cdef vector[size_t] lengths
        cdef Buffer item
        cdef Py_ssize_t idx
        addresses.reserve(count)
        lengths.reserve(count)
        # Gather the raw pointer and byte size of each buffer into the
        # parallel arrays handed to the driver.
        for idx in range(count):
            item = <Buffer>bufs[idx]
            addresses.push_back(as_cu(item._h_ptr))
            lengths.push_back(item._size)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync(
                addresses.data(), lengths.data(), <size_t>count, 0, raw_stream,
            ))
    ELSE:
        raise NotImplementedError(
            "discard requires a CUDA 13 build of cuda.core"
        )
def _advise_one(Buffer buf, advice, location):
    """Internal: apply managed-memory advice to a single buffer.

    Used by :class:`ManagedBuffer` property setters. Not part of the
    public API.

    Raises
    ------
    ValueError
        If ``buf`` is not managed memory, ``advice`` has no entry in the
        allowed-location table, or the coerced location's kind is not
        permitted for ``advice``.
    TypeError
        If ``advice`` is not a ``driver.CUmem_advise`` value.
    """
    _require_managed_buffer(buf, "advise")
    if not isinstance(advice, driver.CUmem_advise):
        raise TypeError(
            f"advice must be a cuda.bindings.driver.CUmem_advise value, "
            f"got {type(advice).__name__}"
        )
    cdef frozenset permitted = _ADVICE_ALLOWED_LOCTYPES.get(advice)
    if permitted is None:
        raise ValueError(f"Unsupported advice value: {advice!r}")
    # Some advice values may be given without a location at all.
    cdef bint location_optional = advice in _ADVICE_IGNORES_LOCATION
    cdef object loc = _coerce_location(location, allow_none=location_optional)
    if loc is not None:
        if loc.kind not in permitted:
            raise ValueError(
                f"advise {advice.name} does not support location_type='{loc.kind}'"
            )
    _do_single_advise(buf, advice, loc, location_optional)
cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none):
    """Call ``cuMemAdvise`` on the whole of ``buf``.

    ``advice_value`` is converted to the C enum through ``int()``. ``loc``
    may be ``None``, in which case a host placeholder is passed to the
    driver. ``allow_none`` is accepted but not read by this function.
    """
    cdef cydriver.CUdeviceptr base = as_cu(buf._h_ptr)
    cdef size_t span = buf._size
    cdef cydriver.CUmem_advise advice_enum = <cydriver.CUmem_advise>(<int>int(advice_value))
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef cydriver.CUmemLocation target
        if loc is not None:
            target = _to_cumemlocation(loc)
        else:
            # Driver ignores location for read_mostly / unset_preferred_location
            # advice values but still validates the CUmemLocation; pass a
            # host placeholder.
            target.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
            target.id = 0
        with nogil:
            HANDLE_RETURN(cydriver.cuMemAdvise(base, span, advice_enum, target))
    ELSE:
        # The legacy API takes a device ordinal; -1 is used when no location
        # was supplied.
        cdef int legacy_dev
        if loc is None:
            legacy_dev = -1
        else:
            legacy_dev = _to_legacy_device(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemAdvise(base, span, advice_enum, legacy_dev))
def prefetch_batch(
    stream: Stream | GraphBuilder,
    buffers: Sequence[Buffer],
    locations: Device | Host | Sequence[Device | Host],
) -> None:
    """Prefetch a batch of managed-memory ranges to target locations.

    Works on both CUDA 12 and CUDA 13 builds (see Notes). For a single
    buffer, use :meth:`ManagedBuffer.prefetch` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream for the asynchronous prefetch. First positional, required
        (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to operate on. Passing a
        bare :class:`Buffer` is rejected.
    locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...]
        Target location(s). A single location applies to all buffers; a
        sequence must match ``len(buffers)``.

    Raises
    ------
    TypeError
        If ``buffers`` is a bare :class:`Buffer` or not a sequence.
    ValueError
        If ``buffers`` is empty, a ``locations`` sequence has a different
        length than ``buffers``, or any buffer is not managed memory.

    Notes
    -----
    On a CUDA 12 build, falls back to a loop calling
    ``cuMemPrefetchAsync`` per buffer (no batched driver entry point on
    CUDA 12). CUDA 13 builds use ``cuMemPrefetchBatchAsync`` directly.
    """
    cdef tuple bufs = _coerce_batch_buffers(buffers, "prefetch_batch")
    cdef Py_ssize_t n = len(bufs)
    cdef tuple locs = _broadcast_locations(locations, n, False, "prefetch_batch")
    cdef Stream s = Stream_accept(stream)

    # Validate every buffer before anything is submitted to the driver.
    cdef Buffer buf
    for buf in bufs:
        _require_managed_buffer(buf, "prefetch_batch")

    _do_batch_prefetch(bufs, locs, s)
def _do_single_prefetch_py(Buffer buf, location, stream):
    """Internal: prefetch one managed buffer for ManagedBuffer.prefetch().

    Uses cuMemPrefetchAsync (works on CUDA 12 and 13). ``location`` is
    required; ``None`` is not accepted.
    """
    _require_managed_buffer(buf, "prefetch")
    cdef object target_loc = _coerce_location(location, allow_none=False)
    cdef Stream target_stream = Stream_accept(stream)
    _do_single_prefetch(buf, target_loc, target_stream)
cdef void _do_single_prefetch(Buffer buf, object loc, Stream s):
    """Call ``cuMemPrefetchAsync`` for the whole of ``buf`` on stream ``s``.

    CUDA 13 builds pass a ``CUmemLocation`` built from ``loc``; older builds
    pass the legacy device ordinal derived from ``loc``.
    """
    cdef cydriver.CUdeviceptr base = as_cu(buf._h_ptr)
    cdef size_t span = buf._size
    cdef cydriver.CUstream raw_stream = as_cu(s._h_stream)
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        cdef cydriver.CUmemLocation target = _to_cumemlocation(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(base, span, target, 0, raw_stream))
    ELSE:
        cdef int legacy_dev = _to_legacy_device(loc)
        with nogil:
            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(base, span, legacy_dev, raw_stream))
IF CUDA_CORE_BUILD_MAJOR >= 13:
    # Function-pointer type for cuMemPrefetchBatchAsync /
    # cuMemDiscardAndPrefetchBatchAsync; both have identical signatures.
    ctypedef cydriver.CUresult (*_BatchPrefetchFn)(
        cydriver.CUdeviceptr*, size_t*, size_t,
        cydriver.CUmemLocation*, size_t*, size_t,
        unsigned long long, cydriver.CUstream,
    ) except ?cydriver.CUDA_ERROR_NOT_FOUND nogil

    def _read_preferred_location_v2(Buffer buf):
        """Internal: read preferred_location with full NUMA detail.

        Bypasses cuda.bindings.driver.cuMemRangeGetAttribute (whose
        attribute allowlist doesn't yet include the cu13 _TYPE / _ID
        attributes) by calling cydriver directly.

        Returns Device | Host | None.
        """
        cdef cydriver.CUdeviceptr base = as_cu(buf._h_ptr)
        cdef size_t span = buf._size
        cdef int kind_raw = 0
        cdef int id_raw = 0
        # Two separate range-attribute queries: location type, then its id.
        with nogil:
            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
                <void*>&kind_raw, sizeof(int),
                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE,
                base, span,
            ))
            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
                <void*>&id_raw, sizeof(int),
                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID,
                base, span,
            ))
        if kind_raw == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE:
            # Imported here rather than at module top.
            from cuda.core._device import Device
            return Device(id_raw)
        elif kind_raw == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST:
            return Host()
        elif kind_raw == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA:
            return Host(numa_id=id_raw)
        elif kind_raw == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT:
            return Host.numa_current()
        # CU_MEM_LOCATION_TYPE_INVALID — no preferred location
        return None

    cdef void _do_batch_prefetch_op(tuple bufs, tuple locs, Stream s, _BatchPrefetchFn fn):
        """Shared body for batched prefetch / discard-and-prefetch.

        Builds parallel arrays of pointers, sizes and locations (one
        location per buffer, so location index ``i`` maps to buffer ``i``)
        and passes them to ``fn`` on stream ``s``.
        """
        cdef Py_ssize_t count = len(bufs)
        cdef cydriver.CUstream raw_stream = as_cu(s._h_stream)
        cdef vector[cydriver.CUdeviceptr] addresses
        cdef vector[size_t] lengths
        cdef vector[cydriver.CUmemLocation] targets
        cdef vector[size_t] target_index
        cdef Buffer item
        cdef Py_ssize_t idx
        addresses.reserve(count)
        lengths.reserve(count)
        targets.reserve(count)
        target_index.reserve(count)
        for idx in range(count):
            item = <Buffer>bufs[idx]
            addresses.push_back(as_cu(item._h_ptr))
            lengths.push_back(item._size)
            targets.push_back(_to_cumemlocation(locs[idx]))
            target_index.push_back(<size_t>idx)
        with nogil:
            HANDLE_RETURN(fn(
                addresses.data(), lengths.data(), <size_t>count,
                targets.data(), target_index.data(), <size_t>count,
                0, raw_stream,
            ))
ELSE:
    def _read_preferred_location_v2(Buffer buf):
        """Placeholder for non-CUDA-13 builds; always raises ``NotImplementedError``."""
        # Symbol exists so _managed_buffer.py can `from ... import
        # _read_preferred_location_v2` unconditionally at module top.
        # `ManagedBuffer.preferred_location` gates on both
        # binding_version() and driver_version() >= (13, 0, 0) before
        # calling, so this path is unreachable on a cu12 build.
        raise NotImplementedError(
            "_read_preferred_location_v2 requires a CUDA 13 build of cuda.core"
        )
cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s):
    """Prefetch each buffer in ``bufs`` to the matching entry of ``locs``.

    CUDA 13 builds issue a single ``cuMemPrefetchBatchAsync`` call; other
    builds prefetch the ranges one at a time.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        _do_batch_prefetch_op(bufs, locs, s, cydriver.cuMemPrefetchBatchAsync)
    ELSE:
        # cu12 has no cuMemPrefetchBatchAsync; loop per-range.
        cdef Py_ssize_t count = len(bufs)
        cdef Py_ssize_t idx
        cdef Buffer item
        for idx in range(count):
            item = <Buffer>bufs[idx]
            _do_single_prefetch(item, locs[idx], s)
def discard_prefetch_batch(
    stream: Stream | GraphBuilder,
    buffers: Sequence[Buffer],
    locations: Device | Host | Sequence[Device | Host],
) -> None:
    """Discard a batch of managed-memory ranges and prefetch them to target locations.

    For a single buffer, use :meth:`ManagedBuffer.discard_prefetch` instead.

    Parameters
    ----------
    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
        Stream on which the asynchronous operation is enqueued. First
        positional, required (mirrors :func:`launch`).
    buffers : Sequence[:class:`Buffer`]
        Non-empty sequence of managed allocations to discard and
        re-prefetch. Passing a bare :class:`Buffer` is rejected.
    locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...]
        Target location(s). A single location applies to all buffers;
        a sequence must match ``len(buffers)``.

    Raises
    ------
    TypeError
        If ``buffers`` is a bare :class:`Buffer` or not a sequence.
    ValueError
        If ``buffers`` is empty, a ``locations`` sequence has a different
        length than ``buffers``, or any buffer is not managed memory.
    NotImplementedError
        On a CUDA 12 build of ``cuda.core``.
    """
    cdef Buffer member
    cdef tuple batch = _coerce_batch_buffers(buffers, "discard_prefetch_batch")
    cdef Py_ssize_t count = len(batch)
    cdef tuple targets = _broadcast_locations(locations, count, False, "discard_prefetch_batch")
    cdef Stream target_stream = Stream_accept(stream)

    # Validate every buffer before anything is submitted to the driver.
    for member in batch:
        _require_managed_buffer(member, "discard_prefetch_batch")

    _do_batch_discard_prefetch(batch, targets, target_stream)
def _do_single_discard_prefetch_py(Buffer buf, location, stream):
    """Internal: discard and re-prefetch one managed buffer.

    Backs ManagedBuffer.discard_prefetch(). The buffer and its location are
    wrapped in one-element tuples and sent through the batched helper.
    """
    _require_managed_buffer(buf, "discard_prefetch")
    cdef object target_loc = _coerce_location(location, allow_none=False)
    cdef Stream target_stream = Stream_accept(stream)
    _do_batch_discard_prefetch((buf,), (target_loc,), target_stream)
cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s):
    """Discard and prefetch each buffer in ``bufs`` to the matching ``locs`` entry.

    Delegates to the shared batched helper with
    ``cuMemDiscardAndPrefetchBatchAsync`` on CUDA 13 builds; raises
    ``NotImplementedError`` on other builds.
    """
    IF CUDA_CORE_BUILD_MAJOR >= 13:
        _do_batch_prefetch_op(
            bufs, locs, s, cydriver.cuMemDiscardAndPrefetchBatchAsync,
        )
    ELSE:
        raise NotImplementedError(
            "discard_prefetch requires a CUDA 13 build of cuda.core"
        )