Coverage for cuda/core/experimental/_memoryview.pyx: 49%

225 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-10 01:19 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from ._dlpack cimport * 

6  

7import functools 

8import warnings 

9from typing import Optional 

10  

11import numpy 

12  

13from cuda.core.experimental._utils.cuda_utils import handle_return, driver 

14from cuda.core.experimental._utils cimport cuda_utils 

15  

16  

17# TODO(leofang): support NumPy structured dtypes 

18  

19  

cdef class StridedMemoryView:
    """A dataclass holding metadata of a strided dense array/tensor.

    A :obj:`StridedMemoryView` instance can be created in two ways:

      1. Using the :obj:`args_viewable_as_strided_memory` decorator (recommended)
      2. Explicit construction, see below

    This object supports both DLPack (up to v1.0) and CUDA Array Interface
    (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol
    first, then the CAI protocol. A :obj:`BufferError` is raised if neither is
    supported.

    Since either way would take a consumer stream, for DLPack it is passed to
    ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a
    stream order will be established between the consumer stream and the
    producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if
    ``cudaStreamWaitEvent`` is called by this method.

    To opt-out of the stream ordering operation in either DLPack or CAI,
    please pass ``stream_ptr=-1``. Note that this deviates (on purpose)
    from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core``
    does not encourage using the (legacy) default/null stream, but is
    consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be
    internally passed to ``obj.__dlpack__()`` instead.

    Attributes
    ----------
    ptr : int
        Pointer to the tensor buffer (as a Python `int`).
    shape : tuple
        Shape of the tensor.
    strides : Optional[tuple]
        Strides of the tensor (in **counts**, not bytes).
    dtype: numpy.dtype
        Data type of the tensor.
    device_id : int
        The device ID for where the tensor is located. It is -1 for CPU tensors
        (meaning those only accessible from the host).
    is_device_accessible : bool
        Whether the tensor data can be accessed on the GPU.
    readonly: bool
        Whether the tensor data can be modified in place.
    exporting_obj : Any
        A reference to the original tensor object that is being viewed.

    Parameters
    ----------
    obj : Any
        Any objects that supports either DLPack (up to v1.0) or CUDA Array
        Interface (v3).
    stream_ptr: int
        The pointer address (as Python `int`) to the **consumer** stream.
        Stream ordering will be properly established unless ``-1`` is passed.
    """
    cdef readonly:
        intptr_t ptr
        int device_id
        bint is_device_accessible
        bint readonly
        object exporting_obj

    cdef:
        # If using dlpack, this is a strong reference to the result of
        # obj.__dlpack__() so we can lazily create shape and strides from
        # it later. If using CAI, this is a reference to the source
        # `__cuda_array_interface__` object.
        object metadata

        # The DLPack tensor if obj has __dlpack__, otherwise must be NULL
        DLTensor *dl_tensor

        # Memoized properties
        tuple _shape
        tuple _strides
        # a `None` value for _strides has defined meaning in dlpack and
        # the cuda array interface, meaning C order, contiguous.
        #
        # this flag helps prevent unnecessary recomputation of _strides
        bint _strides_init
        object _dtype

    def __init__(self, obj: object = None, stream_ptr: int | None = None) -> None:
        # Direct construction is deprecated in every form; the classmethods
        # below are the supported entry points.
        cdef str clsname = self.__class__.__name__
        if obj is not None:
            # populate self's attributes
            if check_has_dlpack(obj):
                warnings.warn(
                    f"Constructing a {clsname} directly from a DLPack-supporting object is deprecated; "
                    "Use `StridedMemoryView.from_dlpack` or `StridedMemoryView.from_any_interface` instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                view_as_dlpack(obj, stream_ptr, self)
            else:
                warnings.warn(
                    f"Constructing a {clsname} directly from a CUDA-array-interface-supporting object is deprecated; "
                    "Use `StridedMemoryView.from_cuda_array_interface` or `StridedMemoryView.from_any_interface` instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                view_as_cai(obj, stream_ptr, self)
        else:
            warnings.warn(
                f"Constructing an empty {clsname} is deprecated; "
                "use one of the classmethods `from_dlpack`, `from_cuda_array_interface` or `from_any_interface` "
                "to construct a StridedMemoryView from an object",
                DeprecationWarning,
                stacklevel=2,
            )

    @classmethod
    def from_dlpack(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
        """Create a view of ``obj`` via the DLPack protocol."""
        cdef StridedMemoryView buf
        # suppress the deprecation warning emitted by the bare constructor
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            buf = cls()
        view_as_dlpack(obj, stream_ptr, buf)
        return buf

    @classmethod
    def from_cuda_array_interface(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
        """Create a view of ``obj`` via the CUDA Array Interface (v3)."""
        cdef StridedMemoryView buf
        # suppress the deprecation warning emitted by the bare constructor
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            buf = cls()
        view_as_cai(obj, stream_ptr, buf)
        return buf

    @classmethod
    def from_any_interface(cls, obj: object, stream_ptr: int | None = None) -> StridedMemoryView:
        """Create a view of ``obj``, preferring DLPack over CAI."""
        if check_has_dlpack(obj):
            return cls.from_dlpack(obj, stream_ptr)
        return cls.from_cuda_array_interface(obj, stream_ptr)

    def __dealloc__(self):
        # Nothing to release for CAI-backed (or empty) views.
        if self.dl_tensor == NULL:
            return

        # Call the DLPack deleter on whichever managed-tensor flavor the
        # capsule holds (versioned first, then legacy), releasing the
        # producer's resources.
        if cpython.PyCapsule_IsValid(
                self.metadata, DLPACK_VERSIONED_TENSOR_USED_NAME):
            data = cpython.PyCapsule_GetPointer(
                self.metadata, DLPACK_VERSIONED_TENSOR_USED_NAME)
            dlm_tensor_ver = <DLManagedTensorVersioned*>data
            dlm_tensor_ver.deleter(dlm_tensor_ver)
        elif cpython.PyCapsule_IsValid(
                self.metadata, DLPACK_TENSOR_USED_NAME):
            data = cpython.PyCapsule_GetPointer(
                self.metadata, DLPACK_TENSOR_USED_NAME)
            dlm_tensor = <DLManagedTensor*>data
            dlm_tensor.deleter(dlm_tensor)

    @property
    def shape(self) -> tuple[int, ...]:
        """Shape of the tensor; computed lazily and memoized."""
        if self._shape is None:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    self._shape = cuda_utils.carray_int64_t_to_tuple(
                        self.dl_tensor.shape,
                        self.dl_tensor.ndim
                    )
                else:
                    self._shape = self.metadata["shape"]
            else:
                # empty (deprecated) view
                self._shape = ()
        return self._shape

    @property
    def strides(self) -> Optional[tuple[int, ...]]:
        """Strides in element counts (not bytes); ``None`` means C-contiguous."""
        cdef int itemsize
        if self._strides_init is False:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    if self.dl_tensor.strides:
                        self._strides = cuda_utils.carray_int64_t_to_tuple(
                            self.dl_tensor.strides,
                            self.dl_tensor.ndim
                        )
                else:
                    # This is a Python interface anyway, so not much point
                    # to using the optimization in cuda_utils.carray_int64_t_to_tuple
                    strides = self.metadata.get("strides")
                    if strides is not None:
                        # CAI strides are in bytes; convert to element counts
                        itemsize = self.dtype.itemsize
                        self._strides = tuple(x // itemsize for x in strides)
            self._strides_init = True
        return self._strides

    @property
    def dtype(self) -> Optional[numpy.dtype]:
        """NumPy dtype of the tensor; computed lazily and memoized."""
        if self._dtype is None:
            if self.exporting_obj is not None:
                if self.dl_tensor != NULL:
                    self._dtype = dtype_dlpack_to_numpy(&self.dl_tensor.dtype)
                else:
                    # TODO: this only works for built-in numeric types
                    self._dtype = numpy.dtype(self.metadata["typestr"])
        return self._dtype

    def __repr__(self):
        return (f"StridedMemoryView(ptr={self.ptr},\n"
              + f"                  shape={self.shape},\n"
              + f"                  strides={self.strides},\n"
              + f"                  dtype={get_simple_repr(self.dtype)},\n"
              + f"                  device_id={self.device_id},\n"
              + f"                  is_device_accessible={self.is_device_accessible},\n"
              + f"                  readonly={self.readonly},\n"
              + f"                  exporting_obj={get_simple_repr(self.exporting_obj)})")

228  

229  

cdef str get_simple_repr(obj):
    """Return a compact display name for *obj*: the class name, qualified by
    its module unless the class is a builtin (or has no module)."""
    # TODO: better handling in np.dtype objects
    cdef object klass = obj if isinstance(obj, type) else obj.__class__
    cdef object module = klass.__module__
    if module is None or module == "builtins":
        return klass.__name__
    return f"{module}.{klass.__name__}"

243  

244  

245  

cdef bint check_has_dlpack(obj) except*:
    """Return True if *obj* speaks DLPack, False if it only speaks CAI.

    Raises if the object supports neither data exchange protocol.
    """
    if hasattr(obj, "__dlpack__") and hasattr(obj, "__dlpack_device__"):
        return True
    if hasattr(obj, "__cuda_array_interface__"):
        return False
    raise RuntimeError(
        "the input object does not support any data exchange protocol")

256  

257  

cdef class _StridedMemoryViewProxy:
    """Lightweight wrapper that defers :obj:`StridedMemoryView` creation
    until the consumer stream is known (used by the decorator below)."""
    cdef readonly:
        object obj
        bint has_dlpack

    def __init__(self, obj):
        self.obj = obj
        # raises if obj supports neither DLPack nor CAI
        self.has_dlpack = check_has_dlpack(obj)

    cpdef StridedMemoryView view(self, stream_ptr=None):
        """Materialize a StridedMemoryView ordered after *stream_ptr*."""
        if not self.has_dlpack:
            return StridedMemoryView.from_cuda_array_interface(self.obj, stream_ptr)
        return StridedMemoryView.from_dlpack(self.obj, stream_ptr)

272  

273  

cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
    """Populate a :obj:`StridedMemoryView` from *obj* via the DLPack protocol.

    ``stream_ptr`` is the consumer stream address; ``-1`` opts out of stream
    ordering and ``None`` is rejected as ambiguous. If *view* is None a new
    view is created, otherwise *view* is filled in place and returned.
    """
    cdef int dldevice, device_id
    cdef bint is_device_accessible, is_readonly
    is_device_accessible = False
    dldevice, device_id = obj.__dlpack_device__()
    if dldevice == _kDLCPU:
        assert device_id == 0
        # normalize: -1 marks host-only memory for this class
        device_id = -1
        if stream_ptr is None:
            raise BufferError("stream=None is ambiguous with view()")
        elif stream_ptr == -1:
            # opt-out sentinel: forward stream=None to the producer
            stream_ptr = None
    elif dldevice == _kDLCUDA:
        assert device_id >= 0
        is_device_accessible = True
        # no need to check other stream values, it's a pass-through
        if stream_ptr is None:
            raise BufferError("stream=None is ambiguous with view()")
    elif dldevice in (_kDLCUDAHost, _kDLCUDAManaged):
        is_device_accessible = True
        # just do a pass-through without any checks, as pinned/managed memory can be
        # accessed on both host and device
    else:
        raise BufferError("device not supported")

    cdef object capsule
    try:
        # prefer the versioned (DLPack >= 1.0) protocol ...
        capsule = obj.__dlpack__(
            stream=int(stream_ptr) if stream_ptr else None,
            max_version=(DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION))
    except TypeError:
        # ... and fall back for producers that predate max_version
        capsule = obj.__dlpack__(
            stream=int(stream_ptr) if stream_ptr else None)

    cdef void* data = NULL
    cdef DLTensor* dl_tensor
    cdef DLManagedTensorVersioned* dlm_tensor_ver
    cdef DLManagedTensor* dlm_tensor
    cdef const char *used_name
    # The capsule name tells us which managed-tensor struct it carries.
    if cpython.PyCapsule_IsValid(
            capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME):
        data = cpython.PyCapsule_GetPointer(
            capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME)
        dlm_tensor_ver = <DLManagedTensorVersioned*>data
        dl_tensor = &dlm_tensor_ver.dl_tensor
        is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0)
        used_name = DLPACK_VERSIONED_TENSOR_USED_NAME
    elif cpython.PyCapsule_IsValid(
            capsule, DLPACK_TENSOR_UNUSED_NAME):
        data = cpython.PyCapsule_GetPointer(
            capsule, DLPACK_TENSOR_UNUSED_NAME)
        dlm_tensor = <DLManagedTensor*>data
        dl_tensor = &dlm_tensor.dl_tensor
        # the legacy struct has no flags field, so assume writable
        is_readonly = False
        used_name = DLPACK_TENSOR_USED_NAME
    else:
        assert False

    # Renaming the capsule marks it consumed, so no one else can import it;
    # __dealloc__ later looks it up under the "used" name to run the deleter.
    cpython.PyCapsule_SetName(capsule, used_name)

    cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
    buf.dl_tensor = dl_tensor
    buf.metadata = capsule
    buf.ptr = <intptr_t>(dl_tensor.data)
    buf.device_id = device_id
    buf.is_device_accessible = is_device_accessible
    buf.readonly = is_readonly
    buf.exporting_obj = obj

    return buf

344  

345  

cdef object dtype_dlpack_to_numpy(DLDataType* dtype):
    """Translate a DLPack ``DLDataType`` into the corresponding ``numpy.dtype``.

    Raises NotImplementedError for vector and bfloat types, and TypeError
    for any unsupported code/bit-width combination.
    """
    cdef int bits = dtype.bits
    if dtype.lanes != 1:
        # TODO: return a NumPy structured dtype?
        raise NotImplementedError(
            f'vector dtypes (lanes={dtype.lanes}) is not supported')
    if dtype.code == kDLUInt:
        np_dtype = {8: numpy.uint8, 16: numpy.uint16,
                    32: numpy.uint32, 64: numpy.uint64}.get(bits)
        if np_dtype is None:
            raise TypeError('uint{} is not supported.'.format(bits))
    elif dtype.code == kDLInt:
        np_dtype = {8: numpy.int8, 16: numpy.int16,
                    32: numpy.int32, 64: numpy.int64}.get(bits)
        if np_dtype is None:
            raise TypeError('int{} is not supported.'.format(bits))
    elif dtype.code == kDLFloat:
        np_dtype = {16: numpy.float16, 32: numpy.float32,
                    64: numpy.float64}.get(bits)
        if np_dtype is None:
            raise TypeError('float{} is not supported.'.format(bits))
    elif dtype.code == kDLComplex:
        # TODO(leofang): support complex32
        np_dtype = {64: numpy.complex64, 128: numpy.complex128}.get(bits)
        if np_dtype is None:
            raise TypeError('complex{} is not supported.'.format(bits))
    elif dtype.code == kDLBool:
        np_dtype = numpy.bool_ if bits == 8 else None
        if np_dtype is None:
            raise TypeError(f'{bits}-bit bool is not supported')
    elif dtype.code == kDLBfloat:
        # TODO(leofang): use ml_dtype.bfloat16?
        raise NotImplementedError('bfloat is not supported yet')
    else:
        raise TypeError('Unsupported dtype. dtype code: {}'.format(dtype.code))

    # We want the dtype object not just the type object
    return numpy.dtype(np_dtype)

404  

405  

cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
    """Populate a :obj:`StridedMemoryView` from *obj* via the CUDA Array
    Interface (v3).

    ``stream_ptr`` is the consumer stream address; ``-1`` opts out of stream
    ordering and ``None`` is rejected as ambiguous. If *view* is None a new
    view is created, otherwise *view* is filled in place and returned.
    """
    cdef dict cai_data = obj.__cuda_array_interface__
    if cai_data["version"] < 3:
        raise BufferError("only CUDA Array Interface v3 or above is supported")
    if cai_data.get("mask") is not None:
        raise BufferError("mask is not supported")
    if stream_ptr is None:
        raise BufferError("stream=None is ambiguous with view()")

    cdef StridedMemoryView buf = StridedMemoryView() if view is None else view
    buf.exporting_obj = obj
    # keep the CAI dict so shape/strides/dtype can be derived lazily
    buf.metadata = cai_data
    buf.dl_tensor = NULL
    # CAI "data" is a (pointer, readonly) pair
    buf.ptr, buf.readonly = cai_data["data"]
    buf.is_device_accessible = True
    buf.device_id = handle_return(
        driver.cuPointerGetAttribute(
            driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
            buf.ptr))

    cdef intptr_t producer_s, consumer_s
    stream_ptr = int(stream_ptr)
    if stream_ptr != -1:
        stream = cai_data.get("stream")
        if stream is not None:
            producer_s = <intptr_t>(stream)
            consumer_s = <intptr_t>(stream_ptr)
            # per CAI v3 the producer stream must be a valid (nonzero) handle
            assert producer_s > 0
            # establish stream order: record an event on the producer stream
            # and make the consumer stream wait on it (cudaStreamWaitEvent
            # semantics), then release the temporary event
            if producer_s != consumer_s:
                e = handle_return(driver.cuEventCreate(
                    driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
                handle_return(driver.cuEventRecord(e, producer_s))
                handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0))
                handle_return(driver.cuEventDestroy(e))

    return buf

443  

444  

def args_viewable_as_strided_memory(tuple arg_indices):
    """
    Decorator to create proxy objects to :obj:`StridedMemoryView` for the
    specified positional arguments.

    This allows array/tensor attributes to be accessed inside the function
    implementation, while keeping the function body array-library-agnostic (if
    desired).

    Inside the decorated function, the specified arguments become instances
    of an (undocumented) proxy type, regardless of its original source. A
    :obj:`StridedMemoryView` instance can be obtained by passing the (consumer)
    stream pointer (as a Python `int`) to the proxies's ``view()`` method. For
    example:

    .. code-block:: python

        @args_viewable_as_strided_memory((1,))
        def my_func(arg0, arg1, arg2, stream: Stream):
            # arg1 can be any object supporting DLPack or CUDA Array Interface
            view = arg1.view(stream.handle)
            assert isinstance(view, StridedMemoryView)
            ...

    Parameters
    ----------
    arg_indices : tuple
        The indices of the target positional arguments.
    """
    def decorator(func):
        @functools.wraps(func)
        def proxied(*args, **kwargs):
            cdef int i
            cdef list new_args = list(args)
            # replace each targeted positional argument with a lazy proxy
            for i in arg_indices:
                new_args[i] = _StridedMemoryViewProxy(new_args[i])
            return func(*new_args, **kwargs)
        return proxied
    return decorator