Coverage for cuda / core / experimental / _memoryview.pyx: 49%
225 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-10 01:19 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
5from ._dlpack cimport *
7import functools
8import warnings
9from typing import Optional
11import numpy
13from cuda.core.experimental._utils.cuda_utils import handle_return, driver
14from cuda.core.experimental._utils cimport cuda_utils
17# TODO(leofang): support NumPy structured dtypes
cdef class StridedMemoryView:
    """A dataclass holding metadata of a strided dense array/tensor.

    A :obj:`StridedMemoryView` instance can be created in two ways:

    1. Using the :obj:`args_viewable_as_strided_memory` decorator (recommended)
    2. Explicit construction, see below

    This object supports both DLPack (up to v1.0) and CUDA Array Interface
    (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol
    first, then the CAI protocol. A :obj:`BufferError` is raised if neither is
    supported.

    Since either way would take a consumer stream, for DLPack it is passed to
    ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a
    stream order will be established between the consumer stream and the
    producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if
    ``cudaStreamWaitEvent`` is called by this method.

    To opt-out of the stream ordering operation in either DLPack or CAI,
    please pass ``stream_ptr=-1``. Note that this deviates (on purpose)
    from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core``
    does not encourage using the (legacy) default/null stream, but is
    consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be
    internally passed to ``obj.__dlpack__()`` instead.

    Attributes
    ----------
    ptr : int
        Pointer to the tensor buffer (as a Python `int`).
    shape : tuple
        Shape of the tensor.
    strides : Optional[tuple]
        Strides of the tensor (in **counts**, not bytes).
    dtype: numpy.dtype
        Data type of the tensor.
    device_id : int
        The device ID for where the tensor is located. It is -1 for CPU tensors
        (meaning those only accessible from the host).
    is_device_accessible : bool
        Whether the tensor data can be accessed on the GPU.
    readonly: bool
        Whether the tensor data can be modified in place.
    exporting_obj : Any
        A reference to the original tensor object that is being viewed.

    Parameters
    ----------
    obj : Any
        Any objects that supports either DLPack (up to v1.0) or CUDA Array
        Interface (v3).
    stream_ptr: int
        The pointer address (as Python `int`) to the **consumer** stream.
        Stream ordering will be properly established unless ``-1`` is passed.
    """
    cdef readonly:
        # Public, read-only attributes (see the class docstring above).
        intptr_t ptr
        int device_id
        bint is_device_accessible
        bint readonly
        object exporting_obj

    cdef:
        # If using dlpack, this is a strong reference to the result of
        # obj.__dlpack__() so we can lazily create shape and strides from
        # it later. If using CAI, this is a reference to the source
        # `__cuda_array_interface__` object.
        object metadata

        # The tensor object if has obj has __dlpack__, otherwise must be NULL
        DLTensor *dl_tensor

        # Memoized properties
        tuple _shape
        tuple _strides
        # a `None` value for _strides has defined meaning in dlpack and
        # the cuda array interface, meaning C order, contiguous.
        #
        # this flag helps prevent unnecessary recompuation of _strides
        bint _strides_init
        object _dtype

    def __init__(self, obj: object = None, stream_ptr: int | None = None) -> None:
        # Deprecated entry point: direct construction still works but warns;
        # the classmethods below are the supported construction paths.
        cdef str clsname = self.__class__.__name__
        if obj is not None:
            # populate self's attributes
            if check_has_dlpack(obj):
                warnings.warn(
                    f"Constructing a {clsname} directly from a DLPack-supporting object is deprecated; "
                    "Use `StridedMemoryView.from_dlpack` or `StridedMemoryView.from_any_interface` instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                view_as_dlpack(obj, stream_ptr, self)
            else:
                warnings.warn(
                    f"Constructing a {clsname} directly from a CUDA-array-interface-supporting object is deprecated; "
                    "Use `StridedMemoryView.from_cuda_array_interface` or `StridedMemoryView.from_any_interface` instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                view_as_cai(obj, stream_ptr, self)
        else:
            warnings.warn(
                f"Constructing an empty {clsname} is deprecated; "
                "use one of the classmethods `from_dlpack`, `from_cuda_array_interface` or `from_any_interface` "
                "to construct a StridedMemoryView from an object",
                DeprecationWarning,
                stacklevel=2,
            )

    @classmethod
    def from_dlpack(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
        """Create a view of ``obj`` via the DLPack protocol."""
        cdef StridedMemoryView buf
        # Suppress the deprecation warning emitted by the (empty) constructor;
        # this classmethod is the supported path.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            buf = cls()
        view_as_dlpack(obj, stream_ptr, buf)
        return buf

    @classmethod
    def from_cuda_array_interface(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
        """Create a view of ``obj`` via the CUDA Array Interface (v3)."""
        cdef StridedMemoryView buf
        # Suppress the deprecation warning emitted by the (empty) constructor;
        # this classmethod is the supported path.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            buf = cls()
        view_as_cai(obj, stream_ptr, buf)
        return buf

    @classmethod
    def from_any_interface(cls, obj: object, stream_ptr: int | None = None) -> StridedMemoryView:
        """Create a view of ``obj``, preferring DLPack over CAI."""
        if check_has_dlpack(obj):
            return cls.from_dlpack(obj, stream_ptr)
        return cls.from_cuda_array_interface(obj, stream_ptr)

    def __dealloc__(self):
        # CAI-backed or empty views own no DLPack tensor; nothing to release.
        if self.dl_tensor == NULL:
            return

        # view_as_dlpack renamed the consumed capsule to the "used" name after
        # taking ownership; per the DLPack protocol it is now our job to invoke
        # the producer's deleter to free the managed tensor.
        if cpython.PyCapsule_IsValid(
                self.metadata, DLPACK_VERSIONED_TENSOR_USED_NAME):
            data = cpython.PyCapsule_GetPointer(
                self.metadata, DLPACK_VERSIONED_TENSOR_USED_NAME)
            dlm_tensor_ver = <DLManagedTensorVersioned*>data
            dlm_tensor_ver.deleter(dlm_tensor_ver)
        elif cpython.PyCapsule_IsValid(
                self.metadata, DLPACK_TENSOR_USED_NAME):
            data = cpython.PyCapsule_GetPointer(
                self.metadata, DLPACK_TENSOR_USED_NAME)
            dlm_tensor = <DLManagedTensor*>data
            dlm_tensor.deleter(dlm_tensor)

    @property
    def shape(self) -> tuple[int, ...]:
        # Memoized: computed on first access from the DLPack C array or the
        # CAI dict; an empty (obj-less) view reports ().
        if self._shape is None:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    self._shape = cuda_utils.carray_int64_t_to_tuple(
                        self.dl_tensor.shape,
                        self.dl_tensor.ndim
                    )
                else:
                    self._shape = self.metadata["shape"]
            else:
                self._shape = ()
        return self._shape

    @property
    def strides(self) -> Optional[tuple[int, ...]]:
        # Memoized via the _strides_init flag rather than a None check, since
        # None is a meaningful result (C-contiguous layout).
        cdef int itemsize
        if self._strides_init is False:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    # DLPack strides are in element counts; a NULL strides
                    # pointer means C-contiguous, leaving _strides as None.
                    if self.dl_tensor.strides:
                        self._strides = cuda_utils.carray_int64_t_to_tuple(
                            self.dl_tensor.strides,
                            self.dl_tensor.ndim
                        )
                else:
                    # This is a Python interface anyway, so not much point
                    # to using the optimization in cuda_utils.carray_int64_t_to_tuple
                    strides = self.metadata.get("strides")
                    if strides is not None:
                        # CAI strides are in bytes; convert to element counts.
                        itemsize = self.dtype.itemsize
                        self._strides = tuple(x // itemsize for x in strides)
            self._strides_init = True
        return self._strides

    @property
    def dtype(self) -> Optional[numpy.dtype]:
        # Memoized: derived from the DLPack dtype struct or the CAI typestr.
        if self._dtype is None:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    self._dtype = dtype_dlpack_to_numpy(&self.dl_tensor.dtype)
                else:
                    # TODO: this only works for built-in numeric types
                    self._dtype = numpy.dtype(self.metadata["typestr"])
        return self._dtype

    def __repr__(self):
        return (f"StridedMemoryView(ptr={self.ptr},\n"
              + f"                  shape={self.shape},\n"
              + f"                  strides={self.strides},\n"
              + f"                  dtype={get_simple_repr(self.dtype)},\n"
              + f"                  device_id={self.device_id},\n"
              + f"                  is_device_accessible={self.is_device_accessible},\n"
              + f"                  readonly={self.readonly},\n"
              + f"                  exporting_obj={get_simple_repr(self.exporting_obj)})")
cdef str get_simple_repr(obj):
    """Return a short dotted name for *obj*'s type (or for *obj* itself when
    it already is a type), omitting the module prefix for builtins."""
    # TODO: better handling in np.dtype objects
    cdef object klass
    cdef str simple_name
    klass = obj if isinstance(obj, type) else obj.__class__
    if klass.__module__ in (None, "builtins"):
        simple_name = klass.__name__
    else:
        simple_name = f"{klass.__module__}.{klass.__name__}"
    return simple_name
cdef bint check_has_dlpack(obj) except*:
    """Return True if *obj* speaks DLPack, False if it only offers the CUDA
    Array Interface; raise if it supports neither protocol."""
    # DLPack requires both __dlpack__ and __dlpack_device__.
    if hasattr(obj, "__dlpack__") and hasattr(obj, "__dlpack_device__"):
        return True
    if hasattr(obj, "__cuda_array_interface__"):
        return False
    # NOTE(review): the StridedMemoryView docstring advertises BufferError for
    # unsupported objects, but this raises RuntimeError — confirm which is
    # intended before changing either.
    raise RuntimeError(
        "the input object does not support any data exchange protocol")
cdef class _StridedMemoryViewProxy:
    """Lazy wrapper used by ``args_viewable_as_strided_memory``; call
    ``view()`` to materialize a :obj:`StridedMemoryView` on a consumer
    stream."""
    cdef readonly:
        object obj
        bint has_dlpack

    def __init__(self, obj):
        self.obj = obj
        # Probe the exchange protocol once up front; raises if the object
        # supports neither DLPack nor the CUDA Array Interface.
        self.has_dlpack = check_has_dlpack(obj)

    cpdef StridedMemoryView view(self, stream_ptr=None):
        """Create a StridedMemoryView of the wrapped object."""
        if not self.has_dlpack:
            return StridedMemoryView.from_cuda_array_interface(self.obj, stream_ptr)
        return StridedMemoryView.from_dlpack(self.obj, stream_ptr)
cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
    # Populate *view* (or a fresh StridedMemoryView) from a DLPack-capable
    # object, taking ownership of the exported managed tensor.
    cdef int dldevice, device_id
    cdef bint is_device_accessible, is_readonly
    is_device_accessible = False
    dldevice, device_id = obj.__dlpack_device__()
    if dldevice == _kDLCPU:
        assert device_id == 0
        # CPU tensors are reported with device_id -1 (host-only memory).
        device_id = -1
        if stream_ptr is None:
            raise BufferError("stream=None is ambiguous with view()")
        elif stream_ptr == -1:
            # Opt-out sentinel: forward stream=None to __dlpack__ below.
            stream_ptr = None
    elif dldevice == _kDLCUDA:
        assert device_id >= 0
        is_device_accessible = True
        # no need to check other stream values, it's a pass-through
        if stream_ptr is None:
            raise BufferError("stream=None is ambiguous with view()")
    elif dldevice in (_kDLCUDAHost, _kDLCUDAManaged):
        is_device_accessible = True
        # just do a pass-through without any checks, as pinned/managed memory can be
        # accessed on both host and device
    else:
        raise BufferError("device not supported")

    cdef object capsule
    try:
        # Prefer the versioned DLPack protocol (capsule carries flags such as
        # the read-only bit)...
        capsule = obj.__dlpack__(
            stream=int(stream_ptr) if stream_ptr else None,
            max_version=(DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION))
    except TypeError:
        # ...falling back to pre-1.0 exporters that reject max_version.
        capsule = obj.__dlpack__(
            stream=int(stream_ptr) if stream_ptr else None)

    cdef void* data = NULL
    cdef DLTensor* dl_tensor
    cdef DLManagedTensorVersioned* dlm_tensor_ver
    cdef DLManagedTensor* dlm_tensor
    cdef const char *used_name
    if cpython.PyCapsule_IsValid(
            capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME):
        data = cpython.PyCapsule_GetPointer(
            capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME)
        dlm_tensor_ver = <DLManagedTensorVersioned*>data
        dl_tensor = &dlm_tensor_ver.dl_tensor
        is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0)
        used_name = DLPACK_VERSIONED_TENSOR_USED_NAME
    elif cpython.PyCapsule_IsValid(
            capsule, DLPACK_TENSOR_UNUSED_NAME):
        data = cpython.PyCapsule_GetPointer(
            capsule, DLPACK_TENSOR_UNUSED_NAME)
        dlm_tensor = <DLManagedTensor*>data
        dl_tensor = &dlm_tensor.dl_tensor
        # The unversioned protocol has no read-only flag.
        is_readonly = False
        used_name = DLPACK_TENSOR_USED_NAME
    else:
        # __dlpack__ must return a capsule bearing one of the two names above.
        assert False

    # Rename the capsule to its "used" name to mark that we consumed it; the
    # deleter is then invoked from StridedMemoryView.__dealloc__.
    cpython.PyCapsule_SetName(capsule, used_name)

    cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
    buf.dl_tensor = dl_tensor
    # Keep the capsule alive so the DLTensor pointers above stay valid.
    buf.metadata = capsule
    buf.ptr = <intptr_t>(dl_tensor.data)
    buf.device_id = device_id
    buf.is_device_accessible = is_device_accessible
    buf.readonly = is_readonly
    buf.exporting_obj = obj

    return buf
cdef object dtype_dlpack_to_numpy(DLDataType* dtype):
    """Map a DLPack ``DLDataType`` onto the equivalent ``numpy.dtype``.

    Raises ``NotImplementedError`` for vector (``lanes != 1``) and bfloat
    types, and ``TypeError`` for any unsupported code/bits combination.
    """
    cdef int bits = dtype.bits
    if dtype.lanes != 1:
        # TODO: return a NumPy structured dtype?
        raise NotImplementedError(
            f'vector dtypes (lanes={dtype.lanes}) is not supported')

    if dtype.code == kDLUInt:
        np_dtype = {8: numpy.uint8, 16: numpy.uint16,
                    32: numpy.uint32, 64: numpy.uint64}.get(bits)
        if np_dtype is None:
            raise TypeError('uint{} is not supported.'.format(bits))
    elif dtype.code == kDLInt:
        np_dtype = {8: numpy.int8, 16: numpy.int16,
                    32: numpy.int32, 64: numpy.int64}.get(bits)
        if np_dtype is None:
            raise TypeError('int{} is not supported.'.format(bits))
    elif dtype.code == kDLFloat:
        np_dtype = {16: numpy.float16, 32: numpy.float32,
                    64: numpy.float64}.get(bits)
        if np_dtype is None:
            raise TypeError('float{} is not supported.'.format(bits))
    elif dtype.code == kDLComplex:
        # TODO(leofang): support complex32
        np_dtype = {64: numpy.complex64, 128: numpy.complex128}.get(bits)
        if np_dtype is None:
            raise TypeError('complex{} is not supported.'.format(bits))
    elif dtype.code == kDLBool:
        if bits != 8:
            raise TypeError(f'{bits}-bit bool is not supported')
        np_dtype = numpy.bool_
    elif dtype.code == kDLBfloat:
        # TODO(leofang): use ml_dtype.bfloat16?
        raise NotImplementedError('bfloat is not supported yet')
    else:
        raise TypeError('Unsupported dtype. dtype code: {}'.format(dtype.code))

    # We want the dtype object not just the type object
    return numpy.dtype(np_dtype)
cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
    # Populate *view* (or a fresh StridedMemoryView) from an object exporting
    # the CUDA Array Interface (v3+), establishing stream order with the
    # producer stream unless the caller opts out with stream_ptr=-1.
    cdef dict cai_data = obj.__cuda_array_interface__
    if cai_data["version"] < 3:
        raise BufferError("only CUDA Array Interface v3 or above is supported")
    if cai_data.get("mask") is not None:
        raise BufferError("mask is not supported")
    if stream_ptr is None:
        raise BufferError("stream=None is ambiguous with view()")

    cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
    buf.exporting_obj = obj
    # Keep the CAI dict itself; shape/strides/dtype are derived lazily from it.
    buf.metadata = cai_data
    buf.dl_tensor = NULL
    buf.ptr, buf.readonly = cai_data["data"]
    buf.is_device_accessible = True
    # Ask the driver which device ordinal owns this pointer.
    buf.device_id = handle_return(
        driver.cuPointerGetAttribute(
            driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
            buf.ptr))

    cdef intptr_t producer_s, consumer_s
    stream_ptr = int(stream_ptr)
    if stream_ptr != -1:  # -1 opts out of stream ordering entirely
        stream = cai_data.get("stream")
        if stream is not None:
            producer_s = <intptr_t>(stream)
            consumer_s = <intptr_t>(stream_ptr)
            assert producer_s > 0
            # establish stream order
            if producer_s != consumer_s:
                # Record an event on the producer stream and make the consumer
                # stream wait on it (cudaStreamWaitEvent-style ordering), then
                # destroy the temporary event.
                e = handle_return(driver.cuEventCreate(
                    driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
                handle_return(driver.cuEventRecord(e, producer_s))
                handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0))
                handle_return(driver.cuEventDestroy(e))

    return buf
def args_viewable_as_strided_memory(tuple arg_indices):
    """
    Decorator to create proxy objects to :obj:`StridedMemoryView` for the
    specified positional arguments.

    This allows array/tensor attributes to be accessed inside the function
    implementation, while keeping the function body array-library-agnostic (if
    desired).

    Inside the decorated function, the specified arguments become instances
    of an (undocumented) proxy type, regardless of its original source. A
    :obj:`StridedMemoryView` instance can be obtained by passing the (consumer)
    stream pointer (as a Python `int`) to the proxies's ``view()`` method. For
    example:

    .. code-block:: python

        @args_viewable_as_strided_memory((1,))
        def my_func(arg0, arg1, arg2, stream: Stream):
            # arg1 can be any object supporting DLPack or CUDA Array Interface
            view = arg1.view(stream.handle)
            assert isinstance(view, StridedMemoryView)
            ...

    Parameters
    ----------
    arg_indices : tuple
        The indices of the target positional arguments.
    """
    def decorator(func):
        @functools.wraps(func)
        def proxy_wrapper(*args, **kwargs):
            cdef int i
            # Swap each targeted positional argument for a lazy proxy.
            arg_list = list(args)
            for i in arg_indices:
                arg_list[i] = _StridedMemoryViewProxy(arg_list[i])
            return func(*arg_list, **kwargs)
        return proxy_wrapper
    return decorator