Coverage for cuda/core/experimental/_memoryview.pyx: 49%

225 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-10 01:19 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from ._dlpack cimport * 

6  

7import functools 

8import warnings 

9from typing import Optional 

10  

11import numpy 

12  

13from cuda.core.experimental._utils.cuda_utils import handle_return, driver 

14from cuda.core.experimental._utils cimport cuda_utils 

15  

16  

17# TODO(leofang): support NumPy structured dtypes 

18  

19  

cdef class StridedMemoryView:
    """A dataclass holding metadata of a strided dense array/tensor.

    A :obj:`StridedMemoryView` instance can be created in two ways:

      1. Using the :obj:`args_viewable_as_strided_memory` decorator (recommended)
      2. Explicit construction, see below

    This object supports both DLPack (up to v1.0) and CUDA Array Interface
    (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol
    first, then the CAI protocol. A :obj:`BufferError` is raised if neither is
    supported.

    Since either way would take a consumer stream, for DLPack it is passed to
    ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a
    stream order will be established between the consumer stream and the
    producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if
    ``cudaStreamWaitEvent`` is called by this method.

    To opt-out of the stream ordering operation in either DLPack or CAI,
    please pass ``stream_ptr=-1``. Note that this deviates (on purpose)
    from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core``
    does not encourage using the (legacy) default/null stream, but is
    consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be
    internally passed to ``obj.__dlpack__()`` instead.

    Attributes
    ----------
    ptr : int
        Pointer to the tensor buffer (as a Python `int`).
    shape : tuple
        Shape of the tensor.
    strides : Optional[tuple]
        Strides of the tensor (in **counts**, not bytes).
    dtype: numpy.dtype
        Data type of the tensor.
    device_id : int
        The device ID for where the tensor is located. It is -1 for CPU tensors
        (meaning those only accessible from the host).
    is_device_accessible : bool
        Whether the tensor data can be accessed on the GPU.
    readonly: bool
        Whether the tensor data can be modified in place.
    exporting_obj : Any
        A reference to the original tensor object that is being viewed.

    Parameters
    ----------
    obj : Any
        Any objects that supports either DLPack (up to v1.0) or CUDA Array
        Interface (v3).
    stream_ptr: int
        The pointer address (as Python `int`) to the **consumer** stream.
        Stream ordering will be properly established unless ``-1`` is passed.
    """
    cdef readonly:
        intptr_t ptr
        int device_id
        bint is_device_accessible
        bint readonly
        object exporting_obj

    cdef:
        # If using dlpack, this is a strong reference to the result of
        # obj.__dlpack__() so we can lazily create shape and strides from
        # it later. If using CAI, this is a reference to the source
        # `__cuda_array_interface__` object.
        object metadata

        # The DLPack tensor if obj has __dlpack__, otherwise must be NULL
        DLTensor *dl_tensor

        # Memoized properties
        tuple _shape
        tuple _strides
        # a `None` value for _strides has defined meaning in dlpack and
        # the cuda array interface, meaning C order, contiguous.
        #
        # this flag helps prevent unnecessary recomputation of _strides
        bint _strides_init
        object _dtype

    def __init__(self, obj: object = None, stream_ptr: int | None = None) -> None:
        # Direct construction is deprecated in every form; the classmethods
        # below are the supported entry points.
        cdef str clsname = self.__class__.__name__
        if obj is not None:
            # populate self's attributes
            if check_has_dlpack(obj):
                warnings.warn(
                    f"Constructing a {clsname} directly from a DLPack-supporting object is deprecated; "
                    "Use `StridedMemoryView.from_dlpack` or `StridedMemoryView.from_any_interface` instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                view_as_dlpack(obj, stream_ptr, self)
            else:
                warnings.warn(
                    f"Constructing a {clsname} directly from a CUDA-array-interface-supporting object is deprecated; "
                    "Use `StridedMemoryView.from_cuda_array_interface` or `StridedMemoryView.from_any_interface` instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                view_as_cai(obj, stream_ptr, self)
        else:
            warnings.warn(
                f"Constructing an empty {clsname} is deprecated; "
                "use one of the classmethods `from_dlpack`, `from_cuda_array_interface` or `from_any_interface` "
                "to construct a StridedMemoryView from an object",
                DeprecationWarning,
                stacklevel=2,
            )

    @classmethod
    def from_dlpack(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
        """Create a view of ``obj`` via the DLPack protocol."""
        cdef StridedMemoryView buf
        # suppress the deprecation warning emitted by the bare constructor
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            buf = cls()
        view_as_dlpack(obj, stream_ptr, buf)
        return buf

    @classmethod
    def from_cuda_array_interface(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
        """Create a view of ``obj`` via the CUDA Array Interface (v3)."""
        cdef StridedMemoryView buf
        # suppress the deprecation warning emitted by the bare constructor
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            buf = cls()
        view_as_cai(obj, stream_ptr, buf)
        return buf

    @classmethod
    def from_any_interface(cls, obj: object, stream_ptr: int | None = None) -> StridedMemoryView:
        """Create a view of ``obj``, preferring DLPack over CAI."""
        if check_has_dlpack(obj):
            return cls.from_dlpack(obj, stream_ptr)
        return cls.from_cuda_array_interface(obj, stream_ptr)

    def __dealloc__(self):
        # Nothing to release for CAI-backed (or empty) views.
        if self.dl_tensor == NULL:
            return

        # Call the DLPack deleter on whichever managed-tensor flavor the
        # capsule holds (versioned first, then legacy), releasing the
        # producer's resources.
        if cpython.PyCapsule_IsValid(
                self.metadata, DLPACK_VERSIONED_TENSOR_USED_NAME):
            data = cpython.PyCapsule_GetPointer(
                self.metadata, DLPACK_VERSIONED_TENSOR_USED_NAME)
            dlm_tensor_ver = <DLManagedTensorVersioned*>data
            dlm_tensor_ver.deleter(dlm_tensor_ver)
        elif cpython.PyCapsule_IsValid(
                self.metadata, DLPACK_TENSOR_USED_NAME):
            data = cpython.PyCapsule_GetPointer(
                self.metadata, DLPACK_TENSOR_USED_NAME)
            dlm_tensor = <DLManagedTensor*>data
            dlm_tensor.deleter(dlm_tensor)

    @property
    def shape(self) -> tuple[int, ...]:
        """Shape of the tensor; computed lazily and memoized."""
        if self._shape is None:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    self._shape = cuda_utils.carray_int64_t_to_tuple(
                        self.dl_tensor.shape,
                        self.dl_tensor.ndim
                    )
                else:
                    self._shape = self.metadata["shape"]
            else:
                # empty (deprecated) view
                self._shape = ()
        return self._shape

    @property
    def strides(self) -> Optional[tuple[int, ...]]:
        """Strides in element counts (not bytes); ``None`` means C-contiguous."""
        cdef int itemsize
        if self._strides_init is False:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    if self.dl_tensor.strides:
                        self._strides = cuda_utils.carray_int64_t_to_tuple(
                            self.dl_tensor.strides,
                            self.dl_tensor.ndim
                        )
                else:
                    # This is a Python interface anyway, so not much point
                    # to using the optimization in cuda_utils.carray_int64_t_to_tuple
                    strides = self.metadata.get("strides")
                    if strides is not None:
                        # CAI strides are in bytes; convert to element counts
                        itemsize = self.dtype.itemsize
                        self._strides = tuple(x // itemsize for x in strides)
            self._strides_init = True
        return self._strides

    @property
    def dtype(self) -> Optional[numpy.dtype]:
        """NumPy dtype of the tensor; computed lazily and memoized."""
        if self._dtype is None:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    self._dtype = dtype_dlpack_to_numpy(&self.dl_tensor.dtype)
                else:
                    # TODO: this only works for built-in numeric types
                    self._dtype = numpy.dtype(self.metadata["typestr"])
        return self._dtype

    def __repr__(self):
        return (f"StridedMemoryView(ptr={self.ptr},\n"
              + f"                  shape={self.shape},\n"
              + f"                  strides={self.strides},\n"
              + f"                  dtype={get_simple_repr(self.dtype)},\n"
              + f"                  device_id={self.device_id},\n"
              + f"                  is_device_accessible={self.is_device_accessible},\n"
              + f"                  readonly={self.readonly},\n"
              + f"                  exporting_obj={get_simple_repr(self.exporting_obj)})")

228  

229  

cdef str get_simple_repr(obj):
    """Return a compact display name for *obj*: the class name, qualified by
    its module unless the class is a builtin (or has no module)."""
    # TODO: better handling in np.dtype objects
    cdef object klass = obj if isinstance(obj, type) else obj.__class__
    cdef object module = klass.__module__
    if module is None or module == "builtins":
        return klass.__name__
    return f"{module}.{klass.__name__}"

243  

244  

245  

cdef bint check_has_dlpack(obj) except*:
    """Return True if *obj* speaks DLPack, False if it only speaks CAI.

    Raises if the object supports neither data exchange protocol.
    """
    if hasattr(obj, "__dlpack__") and hasattr(obj, "__dlpack_device__"):
        return True
    if hasattr(obj, "__cuda_array_interface__"):
        return False
    raise RuntimeError(
        "the input object does not support any data exchange protocol")

256  

257  

cdef class _StridedMemoryViewProxy:
    """Lightweight wrapper that defers :obj:`StridedMemoryView` creation
    until the consumer stream is known (used by the decorator below)."""
    cdef readonly:
        object obj
        bint has_dlpack

    def __init__(self, obj):
        self.obj = obj
        # raises if obj supports neither DLPack nor CAI
        self.has_dlpack = check_has_dlpack(obj)

    cpdef StridedMemoryView view(self, stream_ptr=None):
        """Materialize a StridedMemoryView ordered after *stream_ptr*."""
        if not self.has_dlpack:
            return StridedMemoryView.from_cuda_array_interface(self.obj, stream_ptr)
        return StridedMemoryView.from_dlpack(self.obj, stream_ptr)

272  

273  

cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
    """Populate a :obj:`StridedMemoryView` from *obj* via the DLPack protocol.

    ``stream_ptr`` is the consumer stream address; ``-1`` opts out of stream
    ordering and ``None`` is rejected as ambiguous. If *view* is None a new
    view is created, otherwise *view* is filled in place and returned.
    """
    cdef int dldevice, device_id
    cdef bint is_device_accessible, is_readonly
    is_device_accessible = False
    dldevice, device_id = obj.__dlpack_device__()
    if dldevice == _kDLCPU:
        assert device_id == 0
        # normalize: -1 marks host-only memory for this class
        device_id = -1
        if stream_ptr is None:
            raise BufferError("stream=None is ambiguous with view()")
        elif stream_ptr == -1:
            # opt-out sentinel: forward stream=None to the producer
            stream_ptr = None
    elif dldevice == _kDLCUDA:
        assert device_id >= 0
        is_device_accessible = True
        # no need to check other stream values, it's a pass-through
        if stream_ptr is None:
            raise BufferError("stream=None is ambiguous with view()")
    elif dldevice in (_kDLCUDAHost, _kDLCUDAManaged):
        is_device_accessible = True
        # just do a pass-through without any checks, as pinned/managed memory can be
        # accessed on both host and device
    else:
        raise BufferError("device not supported")

    cdef object capsule
    try:
        # prefer the versioned (DLPack >= 1.0) protocol ...
        capsule = obj.__dlpack__(
            stream=int(stream_ptr) if stream_ptr else None,
            max_version=(DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION))
    except TypeError:
        # ... and fall back for producers that predate max_version
        capsule = obj.__dlpack__(
            stream=int(stream_ptr) if stream_ptr else None)

    cdef void* data = NULL
    cdef DLTensor* dl_tensor
    cdef DLManagedTensorVersioned* dlm_tensor_ver
    cdef DLManagedTensor* dlm_tensor
    cdef const char *used_name
    # The capsule name tells us which managed-tensor struct it carries.
    if cpython.PyCapsule_IsValid(
            capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME):
        data = cpython.PyCapsule_GetPointer(
            capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME)
        dlm_tensor_ver = <DLManagedTensorVersioned*>data
        dl_tensor = &dlm_tensor_ver.dl_tensor
        is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0)
        used_name = DLPACK_VERSIONED_TENSOR_USED_NAME
    elif cpython.PyCapsule_IsValid(
            capsule, DLPACK_TENSOR_UNUSED_NAME):
        data = cpython.PyCapsule_GetPointer(
            capsule, DLPACK_TENSOR_UNUSED_NAME)
        dlm_tensor = <DLManagedTensor*>data
        dl_tensor = &dlm_tensor.dl_tensor
        # the legacy struct has no flags field, so assume writable
        is_readonly = False
        used_name = DLPACK_TENSOR_USED_NAME
    else:
        assert False

    # Renaming the capsule marks it consumed, so no one else can import it;
    # __dealloc__ later looks it up under the "used" name to run the deleter.
    cpython.PyCapsule_SetName(capsule, used_name)

    cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
    buf.dl_tensor = dl_tensor
    buf.metadata = capsule
    buf.ptr = <intptr_t>(dl_tensor.data)
    buf.device_id = device_id
    buf.is_device_accessible = is_device_accessible
    buf.readonly = is_readonly
    buf.exporting_obj = obj

    return buf

344  

345  

cdef object dtype_dlpack_to_numpy(DLDataType* dtype):
    """Translate a DLPack ``DLDataType`` into the corresponding ``numpy.dtype``.

    Raises NotImplementedError for vector and bfloat types, and TypeError
    for any unsupported code/bit-width combination.
    """
    cdef int bits = dtype.bits
    if dtype.lanes != 1:
        # TODO: return a NumPy structured dtype?
        raise NotImplementedError(
            f'vector dtypes (lanes={dtype.lanes}) is not supported')
    if dtype.code == kDLUInt:
        np_dtype = {8: numpy.uint8, 16: numpy.uint16,
                    32: numpy.uint32, 64: numpy.uint64}.get(bits)
        if np_dtype is None:
            raise TypeError('uint{} is not supported.'.format(bits))
    elif dtype.code == kDLInt:
        np_dtype = {8: numpy.int8, 16: numpy.int16,
                    32: numpy.int32, 64: numpy.int64}.get(bits)
        if np_dtype is None:
            raise TypeError('int{} is not supported.'.format(bits))
    elif dtype.code == kDLFloat:
        np_dtype = {16: numpy.float16, 32: numpy.float32,
                    64: numpy.float64}.get(bits)
        if np_dtype is None:
            raise TypeError('float{} is not supported.'.format(bits))
    elif dtype.code == kDLComplex:
        # TODO(leofang): support complex32
        np_dtype = {64: numpy.complex64, 128: numpy.complex128}.get(bits)
        if np_dtype is None:
            raise TypeError('complex{} is not supported.'.format(bits))
    elif dtype.code == kDLBool:
        np_dtype = numpy.bool_ if bits == 8 else None
        if np_dtype is None:
            raise TypeError(f'{bits}-bit bool is not supported')
    elif dtype.code == kDLBfloat:
        # TODO(leofang): use ml_dtype.bfloat16?
        raise NotImplementedError('bfloat is not supported yet')
    else:
        raise TypeError('Unsupported dtype. dtype code: {}'.format(dtype.code))

    # We want the dtype object not just the type object
    return numpy.dtype(np_dtype)

404  

405  

cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
    """Populate a :obj:`StridedMemoryView` from *obj* via the CUDA Array
    Interface (v3).

    ``stream_ptr`` is the consumer stream address; ``-1`` opts out of stream
    ordering and ``None`` is rejected as ambiguous. If *view* is None a new
    view is created, otherwise *view* is filled in place and returned.
    """
    cdef dict cai_data = obj.__cuda_array_interface__
    if cai_data["version"] < 3:
        raise BufferError("only CUDA Array Interface v3 or above is supported")
    if cai_data.get("mask") is not None:
        raise BufferError("mask is not supported")
    if stream_ptr is None:
        raise BufferError("stream=None is ambiguous with view()")

    cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
    buf.exporting_obj = obj
    # keep the CAI dict so shape/strides/dtype can be derived lazily
    buf.metadata = cai_data
    buf.dl_tensor = NULL
    # CAI "data" is a (pointer, readonly) pair
    buf.ptr, buf.readonly = cai_data["data"]
    buf.is_device_accessible = True
    buf.device_id = handle_return(
        driver.cuPointerGetAttribute(
            driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
            buf.ptr))

    cdef intptr_t producer_s, consumer_s
    stream_ptr = int(stream_ptr)
    if stream_ptr != -1:
        stream = cai_data.get("stream")
        if stream is not None:
            producer_s = <intptr_t>(stream)
            consumer_s = <intptr_t>(stream_ptr)
            # per CAI v3 the producer stream must be a valid (nonzero) handle
            assert producer_s > 0
            # establish stream order: record an event on the producer stream
            # and make the consumer stream wait on it (cudaStreamWaitEvent
            # semantics), then release the temporary event
            if producer_s != consumer_s:
                e = handle_return(driver.cuEventCreate(
                    driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
                handle_return(driver.cuEventRecord(e, producer_s))
                handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0))
                handle_return(driver.cuEventDestroy(e))

    return buf

443  

444  

def args_viewable_as_strided_memory(tuple arg_indices):
    """
    Decorator to create proxy objects to :obj:`StridedMemoryView` for the
    specified positional arguments.

    This allows array/tensor attributes to be accessed inside the function
    implementation, while keeping the function body array-library-agnostic (if
    desired).

    Inside the decorated function, the specified arguments become instances
    of an (undocumented) proxy type, regardless of its original source. A
    :obj:`StridedMemoryView` instance can be obtained by passing the (consumer)
    stream pointer (as a Python `int`) to the proxies's ``view()`` method. For
    example:

    .. code-block:: python

        @args_viewable_as_strided_memory((1,))
        def my_func(arg0, arg1, arg2, stream: Stream):
            # arg1 can be any object supporting DLPack or CUDA Array Interface
            view = arg1.view(stream.handle)
            assert isinstance(view, StridedMemoryView)
            ...

    Parameters
    ----------
    arg_indices : tuple
        The indices of the target positional arguments.
    """
    def decorator(func):
        @functools.wraps(func)
        def proxied(*args, **kwargs):
            cdef int i
            cdef list new_args = list(args)
            # replace each targeted positional argument with a lazy proxy
            for i in arg_indices:
                new_args[i] = _StridedMemoryViewProxy(new_args[i])
            return func(*new_args, **kwargs)
        return proxied
    return decorator