Coverage for cuda / core / experimental / _memory / _buffer.pyx: 91%
158 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-10 01:19 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-10 01:19 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
5from __future__ import annotations
7from libc.stdint cimport uintptr_t, int64_t, uint64_t
9from cuda.bindings cimport cydriver
10from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
11from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer
12from cuda.core.experimental._memory cimport _ipc
13from cuda.core.experimental._stream cimport Stream_accept, Stream
14from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
16import abc
17from typing import TypeVar, Union
19from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
20from cuda.core.experimental._utils.cuda_utils import driver
__all__ = ["Buffer", "MemoryResource"]


DevicePointerT = Union[driver.CUdeviceptr, int, None]
"""
Type union of :obj:`~driver.CUdeviceptr`, `int` and `None`, used as the type
hint for :attr:`Buffer.handle`.
"""
cdef class Buffer:
    """Represent a handle to allocated memory.

    This generic object provides a unified representation of how
    different memory resources give access to their memory
    allocations.

    Support for data interchange mechanisms is provided by DLPack.
    """

    def __cinit__(self):
        self._clear()

    def _clear(self):
        # Reset every field to its "no allocation" state.
        self._ptr = 0
        self._size = 0
        self._memory_resource = None
        self._ipc_data = None
        self._ptr_obj = None
        self._alloc_stream = None

    def __init__(self, *args, **kwargs):
        raise RuntimeError("Buffer objects cannot be instantiated directly. "
                           "Please use MemoryResource APIs.")

    @classmethod
    def _init(
        cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None,
        stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None
    ):
        # Internal constructor: __new__ bypasses __init__, which always raises.
        cdef Buffer self = Buffer.__new__(cls)
        # Keep both the raw address and the original handle object; ``handle``
        # prefers returning the latter.
        self._ptr = <uintptr_t>(int(ptr))
        self._ptr_obj = ptr
        self._size = size
        self._memory_resource = mr
        self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
        self._alloc_stream = <Stream>(stream) if stream is not None else None
        return self

    def __dealloc__(self):
        self.close(self._alloc_stream)

    def __reduce__(self):
        # Must not serialize the parent's stream!
        return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor())

    @staticmethod
    def from_handle(
        ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None
    ) -> Buffer:
        """Create a new :class:`Buffer` object from a pointer.

        Parameters
        ----------
        ptr : :obj:`~_memory.DevicePointerT`
            Allocated buffer handle object
        size : int
            Memory size of the buffer
        mr : :obj:`~_memory.MemoryResource`, optional
            Memory resource associated with the buffer
        """
        # TODO: It is better to take a stream for later deallocation
        return Buffer._init(ptr, size, mr=mr)

    @classmethod
    def from_ipc_descriptor(
        cls, mr: DeviceMemoryResource, ipc_descriptor: IPCBufferDescriptor,
        stream: Stream = None
    ) -> Buffer:
        """Import a buffer that was exported from another process."""
        return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream)

    def get_ipc_descriptor(self) -> IPCBufferDescriptor:
        """Export a buffer allocated for sharing between processes."""
        # The descriptor is created on first use and cached afterwards.
        if self._ipc_data is None:
            self._ipc_data = IPCDataForBuffer(_ipc.Buffer_get_ipc_descriptor(self), False)
        return self._ipc_data.ipc_descriptor

    def close(self, stream: Stream | GraphBuilder | None = None):
        """Deallocate this buffer asynchronously on the given stream.

        This buffer is released back to its memory resource
        asynchronously on the given stream.

        Parameters
        ----------
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
            The stream object to use for asynchronous deallocation. If None,
            the behavior depends on the underlying memory resource.
        """
        Buffer_close(self, stream)

    def copy_to(self, dst: Buffer = None, *, stream: Stream | GraphBuilder) -> Buffer:
        """Copy from this buffer to the dst buffer asynchronously on the given stream.

        Copies the data from this buffer to the provided dst buffer.
        If the dst buffer is not provided, then a new buffer is first
        allocated using the associated memory resource before the copy.

        Parameters
        ----------
        dst : :obj:`~_memory.Buffer`, optional
            Destination buffer to copy data to. Its size must equal the size
            of this buffer.
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
            Keyword argument specifying the stream for the
            asynchronous copy

        Returns
        -------
        Buffer
            The destination buffer (``dst`` itself, or the newly allocated one).

        Raises
        ------
        ValueError
            If ``dst`` is omitted and this buffer has no memory resource, or
            if the source and destination sizes differ.
        """
        stream = Stream_accept(stream)
        cdef Stream s_stream = <Stream>stream
        cdef size_t src_size = self._size

        if dst is None:
            if self._memory_resource is None:
                raise ValueError("a destination buffer must be provided (this "
                                 "buffer does not have a memory_resource)")
            dst = self._memory_resource.allocate(src_size, stream)

        cdef size_t dst_size = dst._size
        if dst_size != src_size:
            raise ValueError( "buffer sizes mismatch between src and dst (sizes "
                             f"are: src={src_size}, dst={dst_size})"
            )
        cdef cydriver.CUstream s = s_stream._handle
        with nogil:
            HANDLE_RETURN(cydriver.cuMemcpyAsync(
                <cydriver.CUdeviceptr>dst._ptr,
                <cydriver.CUdeviceptr>self._ptr,
                src_size,
                s
            ))
        return dst

    def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder):
        """Copy from the src buffer to this buffer asynchronously on the given stream.

        Parameters
        ----------
        src : :obj:`~_memory.Buffer`
            Source buffer to copy data from. Its size must equal the size of
            this buffer.
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
            Keyword argument specifying the stream for the
            asynchronous copy

        Raises
        ------
        ValueError
            If the source and destination sizes differ.
        """
        stream = Stream_accept(stream)
        cdef Stream s_stream = <Stream>stream
        cdef size_t dst_size = self._size
        cdef size_t src_size = src._size

        if src_size != dst_size:
            raise ValueError( "buffer sizes mismatch between src and dst (sizes "
                             f"are: src={src_size}, dst={dst_size})"
            )
        cdef cydriver.CUstream s = s_stream._handle
        with nogil:
            HANDLE_RETURN(cydriver.cuMemcpyAsync(
                <cydriver.CUdeviceptr>self._ptr,
                <cydriver.CUdeviceptr>src._ptr,
                dst_size,
                s
            ))

    def fill(self, value: int, width: int, *, stream: Stream | GraphBuilder):
        """Fill this buffer with a value pattern asynchronously on the given stream.

        Parameters
        ----------
        value : int
            Integer value to fill the buffer with
        width : int
            Width in bytes for each element (must be 1, 2, or 4)
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
            Keyword argument specifying the stream for the asynchronous fill

        Raises
        ------
        ValueError
            If width is not 1, 2, or 4, if value is out of range for the width,
            or if buffer size is not divisible by width

        """
        cdef Stream s_stream = Stream_accept(stream)
        cdef unsigned char c_value8
        cdef unsigned short c_value16
        cdef unsigned int c_value32
        cdef size_t N

        # Validate width
        if width not in (1, 2, 4):
            raise ValueError(f"width must be 1, 2, or 4, got {width}")

        # Validate buffer size modulus.
        cdef size_t buffer_size = self._size
        if buffer_size % width != 0:
            raise ValueError(f"buffer size ({buffer_size}) must be divisible by width ({width})")

        # Map width (bytes) to bitwidth and validate value
        cdef int bitwidth = width * 8
        try:
            _validate_value_against_bitwidth(bitwidth, value, is_signed=False)
        except OverflowError:
            # ``value`` does not fit the helper's int64_t parameter, so the
            # Python-to-C conversion fails before the range check can run.
            # Report it as the documented ValueError, using the same message
            # format as the helper. The bound is computed with Python ints
            # (``width`` is a Python object) so it cannot overflow.
            raise ValueError(
                f"value must be in range [0, {(1 << (width * 8)) - 1}]"
            ) from None

        # Value is known to fit in width; perform the fill
        cdef cydriver.CUstream s = s_stream._handle
        if width == 1:
            c_value8 = <unsigned char>value
            N = buffer_size
            with nogil:
                HANDLE_RETURN(cydriver.cuMemsetD8Async(<cydriver.CUdeviceptr>self._ptr, c_value8, N, s))
        elif width == 2:
            c_value16 = <unsigned short>value
            N = buffer_size // 2
            with nogil:
                HANDLE_RETURN(cydriver.cuMemsetD16Async(<cydriver.CUdeviceptr>self._ptr, c_value16, N, s))
        else:  # width == 4
            c_value32 = <unsigned int>value
            N = buffer_size // 4
            with nogil:
                HANDLE_RETURN(cydriver.cuMemsetD32Async(<cydriver.CUdeviceptr>self._ptr, c_value32, N, s))

    def __dlpack__(
        self,
        *,
        stream: int | None = None,
        max_version: tuple[int, int] | None = None,
        dl_device: tuple[int, int] | None = None,
        copy: bool | None = None,
    ) -> TypeVar("PyCapsule"):
        # Note: we ignore the stream argument entirely (as if it is -1).
        # It is the user's responsibility to maintain stream order.
        if dl_device is not None:
            raise BufferError("Sorry, not supported: dl_device other than None")
        if copy is True:
            raise BufferError("Sorry, not supported: copy=True")
        if max_version is None:
            versioned = False
        else:
            if not isinstance(max_version, tuple) or len(max_version) != 2:
                raise BufferError(f"Expected max_version tuple[int, int], got {max_version}")
            # A versioned capsule is only produced for consumers at >= (1, 0).
            versioned = max_version >= (1, 0)
        capsule = make_py_capsule(self, versioned)
        return capsule

    def __dlpack_device__(self) -> tuple[int, int]:
        cdef bint d = self.is_device_accessible
        cdef bint h = self.is_host_accessible
        if d and (not h):
            return (DLDeviceType.kDLCUDA, self.device_id)
        if d and h:
            # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks
            return (DLDeviceType.kDLCUDAHost, 0)
        if (not d) and h:
            return (DLDeviceType.kDLCPU, 0)
        raise BufferError("buffer is neither device-accessible nor host-accessible")

    def __buffer__(self, flags: int, /) -> memoryview:
        # Support for Python-level buffer protocol as per PEP 688.
        # This raises a BufferError unless:
        #   1. Python is 3.12+
        #   2. This Buffer object is host accessible
        raise NotImplementedError("WIP: Buffer.__buffer__ hasn't been implemented yet.")

    def __release_buffer__(self, buffer: memoryview, /):
        # Supporting method paired with __buffer__.
        raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.")

    @property
    def device_id(self) -> int:
        """Return the device ordinal of this buffer."""
        if self._memory_resource is not None:
            return self._memory_resource.device_id
        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

    @property
    def handle(self) -> DevicePointerT:
        """Return the buffer handle object.

        .. caution::

            This handle is a Python object. To get the memory address of the underlying C
            handle, call ``int(Buffer.handle)``.
        """
        if self._ptr_obj is not None:
            return self._ptr_obj
        elif self._ptr:
            return self._ptr
        else:
            # contract: Buffer is closed
            return 0

    @property
    def is_device_accessible(self) -> bool:
        """Return True if this buffer can be accessed by the GPU, otherwise False."""
        if self._memory_resource is not None:
            return self._memory_resource.is_device_accessible
        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

    @property
    def is_host_accessible(self) -> bool:
        """Return True if this buffer can be accessed by the CPU, otherwise False."""
        if self._memory_resource is not None:
            return self._memory_resource.is_host_accessible
        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

    @property
    def is_mapped(self) -> bool:
        """Return True if this buffer is mapped into the process via IPC."""
        # ``_ipc_data`` may be None, in which case getattr falls back to False.
        return getattr(self._ipc_data, "is_mapped", False)

    @property
    def memory_resource(self) -> MemoryResource:
        """Return the memory resource associated with this buffer."""
        return self._memory_resource

    @property
    def size(self) -> int:
        """Return the memory size of this buffer."""
        return self._size
350# Buffer Implementation
351# ---------------------
cdef inline void Buffer_close(Buffer self, stream):
    """Hand the buffer's memory back to its memory resource and reset the handle.

    Does nothing when the buffer has no pointer or no memory resource. When
    ``stream`` is None, the stream stored at allocation time is passed to
    ``deallocate`` instead.
    """
    cdef Stream dealloc_stream

    # Nothing to release: no pointer, or no resource to release it to.
    if not self._ptr or self._memory_resource is None:
        return

    if stream is None:
        dealloc_stream = self._alloc_stream
    else:
        dealloc_stream = Stream_accept(stream)

    self._memory_resource.deallocate(self._ptr, self._size, dealloc_stream)

    # Drop all references so a second close() is a no-op.
    self._ptr = 0
    self._memory_resource = None
    self._ptr_obj = None
    self._alloc_stream = None
cdef class MemoryResource:
    """Abstract base class for memory resources that manage allocation and
    deallocation of buffers.

    Subclasses must implement the allocation and deallocation methods, along
    with the properties of this memory resource that every allocated buffer
    inherits. (Each :class:`Buffer` returned by :meth:`allocate` holds a
    reference to the resource that created it, so buffer properties are
    obtained by looking up the corresponding property on that resource.)
    """

    @abc.abstractmethod
    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
        """Allocate a buffer of the requested size.

        Parameters
        ----------
        size : int
            Number of bytes to allocate.
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
            Stream on which the allocation is performed asynchronously.
            When None, each memory resource implementation decides (and
            documents) what happens.

        Returns
        -------
        Buffer
            The allocated buffer object, usable for device or host operations
            depending on the resource's properties.
        """
        pass

    @abc.abstractmethod
    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
        """Deallocate a buffer previously allocated by this resource.

        Parameters
        ----------
        ptr : :obj:`~_memory.DevicePointerT`
            Pointer or handle of the buffer to release.
        size : int
            Size of the buffer being released, in bytes.
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
            Stream on which the deallocation is performed asynchronously.
            When None, each memory resource implementation decides (and
            documents) what happens.
        """
        pass
414# Helper Functions
415# ----------------
cdef void _validate_value_against_bitwidth(int bitwidth, int64_t value, bint is_signed=False) except *:
    """Validate that a value fits within the representable range for a given bitwidth.

    Parameters
    ----------
    bitwidth : int
        Number of bits (e.g., 8, 16, 32); must be in the range [1, 63]
    value : int64_t
        Value to validate
    is_signed : bool, optional
        Whether the value is signed (default: False)

    Raises
    ------
    ValueError
        If bitwidth is outside [1, 63], or if value is outside the
        representable range for the bitwidth
    """
    # Checked explicitly rather than with ``assert``: assertions can be
    # compiled out, and the shifts below are undefined behaviour in C for a
    # shift count that is negative or >= 64.
    if bitwidth < 1 or bitwidth > 63:
        raise ValueError(f"bitwidth ({bitwidth}) must be in range [1, 63]")

    cdef int64_t min_value
    cdef int64_t max_value

    if is_signed:
        # Two's-complement range: [-2**(bits-1), 2**(bits-1) - 1]
        min_value = -(<int64_t>1 << (bitwidth - 1))
        max_value = (<int64_t>1 << (bitwidth - 1)) - 1
    else:
        min_value = 0
        # Shift as unsigned so that bitwidth == 63 does not overflow a signed
        # 64-bit integer; the result (2**bits - 1) always fits in int64_t.
        max_value = <int64_t>((<uint64_t>1 << bitwidth) - 1)

    if value < min_value or value > max_value:
        raise ValueError(
            f"value must be in range [{min_value}, {max_value}]"
        )