Coverage for cuda/core/texture/_array.pyx: 90.87%

208 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-03 01:38 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from __future__ import annotations 

6  

7cimport cpython 

8from libc.stdint cimport intptr_t 

9from libc.string cimport memset 

10  

11from cuda.bindings cimport cydriver 

12from cuda.core._memory._buffer cimport Buffer 

13from cuda.core._resource_handles cimport ( 

14 OpaqueArrayHandle, 

15 as_cu, 

16 as_intptr, 

17 create_array_handle, 

18 create_array_handle_owning, 

19 create_array_handle_ref, 

20 get_last_error, 

21) 

22from cuda.core._stream cimport Stream, Stream_accept 

23from cuda.core._utils.cuda_utils cimport ( 

24 HANDLE_RETURN, 

25 _get_current_device_id, 

26) 

27  

28from enum import IntEnum 

29  

30  

class ArrayFormat(IntEnum):
    """Per-channel element encodings available for an :class:`OpaqueArray`.

    Every member carries the numeric value of the matching
    ``CUarray_format`` constant from the CUDA driver API.
    """

    # Unsigned integer channels.
    UINT8 = cydriver.CU_AD_FORMAT_UNSIGNED_INT8
    UINT16 = cydriver.CU_AD_FORMAT_UNSIGNED_INT16
    UINT32 = cydriver.CU_AD_FORMAT_UNSIGNED_INT32

    # Signed integer channels.
    INT8 = cydriver.CU_AD_FORMAT_SIGNED_INT8
    INT16 = cydriver.CU_AD_FORMAT_SIGNED_INT16
    INT32 = cydriver.CU_AD_FORMAT_SIGNED_INT32

    # Floating-point channels.
    FLOAT16 = cydriver.CU_AD_FORMAT_HALF
    FLOAT32 = cydriver.CU_AD_FORMAT_FLOAT

44  

45  

# Size in bytes of one channel of one element, keyed by the integer value of
# the format. Grouped by width so each size is written once.
_FORMAT_ELEM_SIZE = {
    int(_fmt): _nbytes
    for _nbytes, _members in (
        (1, (ArrayFormat.UINT8, ArrayFormat.INT8)),
        (2, (ArrayFormat.UINT16, ArrayFormat.INT16, ArrayFormat.FLOAT16)),
        (4, (ArrayFormat.UINT32, ArrayFormat.INT32, ArrayFormat.FLOAT32)),
    )
    for _fmt in _members
}

57  

58  

def _validate_format_channels(format, num_channels):
    """Validate the ``(format, num_channels)`` pair shared by the array,
    mipmap, and texture factories.

    Parameters
    ----------
    format : ArrayFormat
        Element format; any other type is rejected.
    num_channels : int
        Channels per element. Must be an integer equal to 1, 2, or 4.

    Raises
    ------
    TypeError
        If ``format`` is not an :class:`ArrayFormat`.
    ValueError
        If ``num_channels`` is a ``bool``, is not an integer, or is not one
        of 1, 2, or 4.
    """
    from operator import index as _as_index

    if not isinstance(format, ArrayFormat):
        raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")

    # Membership in (1, 2, 4) is an equality test, and ``2.0 == 2``: without
    # an integer check a float would pass validation here. ``__index__`` is
    # the protocol for "this is a real integer", so integer-like types that
    # implement it are still accepted.
    try:
        channels = _as_index(num_channels)
    except TypeError:
        channels = None

    # ``bool`` is an ``int`` subclass, so it has to be excluded explicitly.
    if isinstance(num_channels, bool) or channels not in (1, 2, 4):
        raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")

66  

67  

def _validate_array_shape(shape):
    """Coerce ``shape`` to a tuple of ints and validate it.

    Parameters
    ----------
    shape : iterable of int
        One to three extents, each >= 1.

    Returns
    -------
    tuple of int
        The normalized shape.

    Raises
    ------
    TypeError
        If ``shape`` is a ``str``/``bytes``/``bytearray``, is not iterable,
        or holds an item that ``int()`` cannot accept.
    ValueError
        If the rank is not 1, 2, or 3, if an extent is a float with a
        fractional part, or if an extent is < 1.
    """
    bad_type = f"shape must be a tuple of ints, got {type(shape).__name__}"

    # Text and byte strings are iterable and their items survive ``int()``,
    # so "64" would otherwise be read as the 2-D shape (6, 4).
    if isinstance(shape, (str, bytes, bytearray)):
        raise TypeError(bad_type)

    try:
        # Materialize once so one-shot iterables are consumed a single time.
        given = tuple(shape)
        shape_t = tuple(int(s) for s in given)
    except TypeError as e:
        raise TypeError(bad_type) from e

    if not 1 <= len(shape_t) <= 3:
        raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")

    for i, (raw, dim) in enumerate(zip(given, shape_t)):
        # ``int()`` truncates toward zero; refuse a float that would lose its
        # fractional part rather than silently allocating a smaller array.
        if isinstance(raw, float) and raw != dim:
            raise ValueError(f"shape[{i}] must be an integer, got {raw!r}")
        if dim < 1:
            raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
    return shape_t

81  

82  

cdef void _fill_array_endpoint(
    cydriver.CUDA_MEMCPY3D* p, OpaqueArray arr, bint is_src
) noexcept:
    """Point one side of a ``CUDA_MEMCPY3D`` at ``arr``.

    Writes the source fields when ``is_src`` is true and the destination
    fields otherwise: memory type ``CU_MEMORYTYPE_ARRAY``, the array taken
    from ``arr._handle``, and a zero X/Y/Z offset.
    """
    cdef cydriver.CUarray cu_array = as_cu(arr._handle)

    if not is_src:
        p.dstMemoryType = cydriver.CU_MEMORYTYPE_ARRAY
        p.dstArray = cu_array
        p.dstXInBytes = 0
        p.dstY = 0
        p.dstZ = 0
        return

    p.srcMemoryType = cydriver.CU_MEMORYTYPE_ARRAY
    p.srcArray = cu_array
    p.srcXInBytes = 0
    p.srcY = 0
    p.srcZ = 0

99  

100  

cdef int _fill_host_endpoint(
    cydriver.CUDA_MEMCPY3D* p,
    object obj,
    bint is_src,
    size_t width_bytes,
    size_t height,
    size_t required,
    cpython.Py_buffer* pybuf_out,
) except -1:
    """Populate src/dst host fields from a buffer-protocol ``obj``.

    Acquires a ``PyBUF_SIMPLE`` view of ``obj`` into ``pybuf_out``
    (additionally ``PyBUF_WRITABLE`` when the host side is the destination)
    and writes its address, ``width_bytes`` as the pitch, ``height`` and a
    zero offset into the matching side of ``p``.

    Returns 1 with the view held; the caller is responsible for releasing
    it. On every error path the view is not held when the exception
    propagates.

    Raises
    ------
    TypeError
        If ``obj`` does not support the buffer protocol.
    ValueError
        If the exported buffer is smaller than ``required`` bytes.

    Any exception raised by the buffer export itself (for example a
    read-only object used as a destination) propagates unchanged.
    """
    cdef int flags = cpython.PyBUF_SIMPLE
    cdef Py_ssize_t available

    if not is_src:
        flags |= cpython.PyBUF_WRITABLE

    # Test for buffer support explicitly: PyObject_GetBuffer raises on
    # failure rather than returning a status to inspect, so this is the only
    # place the descriptive message can be produced.
    if not cpython.PyObject_CheckBuffer(obj):
        raise TypeError(
            f"Source/destination must be a Buffer or a contiguous "
            f"buffer-protocol object, got {type(obj).__name__}"
        )
    cpython.PyObject_GetBuffer(obj, pybuf_out, flags)

    # Read the length before any release: the view's fields must not be
    # relied on once PyBuffer_Release has run.
    available = pybuf_out.len
    if <size_t>available < required:
        cpython.PyBuffer_Release(pybuf_out)
        raise ValueError(
            f"Host buffer has {available} bytes, smaller than the array "
            f"extent ({required} bytes)"
        )

    if is_src:
        p.srcMemoryType = cydriver.CU_MEMORYTYPE_HOST
        p.srcHost = pybuf_out.buf
        p.srcPitch = width_bytes
        p.srcHeight = height
        p.srcXInBytes = 0
        p.srcY = 0
        p.srcZ = 0
    else:
        p.dstMemoryType = cydriver.CU_MEMORYTYPE_HOST
        p.dstHost = pybuf_out.buf
        p.dstPitch = width_bytes
        p.dstHeight = height
        p.dstXInBytes = 0
        p.dstY = 0
        p.dstZ = 0
    return 1

146  

147  

cdef int _fill_linear_endpoint(
    cydriver.CUDA_MEMCPY3D* p,
    object obj,
    bint is_src,
    size_t width_bytes,
    size_t height,
    size_t depth,
    cpython.Py_buffer* pybuf_out,
) except -1:
    """Fill the linear (non-array) side of a ``CUDA_MEMCPY3D``.

    A :class:`Buffer` is described through its device pointer; anything else
    is handed to :func:`_fill_host_endpoint`. In both cases the pitch is
    ``width_bytes`` and the object must hold at least
    ``width_bytes * height * depth`` bytes.

    Returns 1 if ``pybuf_out`` was filled (caller must release it), 0 when
    ``obj`` is a :class:`Buffer` and no view was taken.

    Raises ``ValueError`` when a :class:`Buffer` is too small.
    """
    cdef intptr_t dev_ptr
    cdef Buffer dev_buf
    cdef size_t needed = width_bytes * height * depth

    # Host path: everything that is not a Buffer.
    if not isinstance(obj, Buffer):
        return _fill_host_endpoint(
            p, obj, is_src, width_bytes, height, needed, pybuf_out
        )

    dev_buf = <Buffer>obj
    if <size_t>dev_buf.size < needed:
        raise ValueError(
            f"Buffer size ({dev_buf.size} bytes) is smaller than "
            f"the array extent ({needed} bytes)"
        )

    dev_ptr = int(dev_buf.handle)
    if is_src:
        p.srcMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
        p.srcDevice = <cydriver.CUdeviceptr>dev_ptr
        p.srcPitch = width_bytes
        p.srcHeight = height
        p.srcXInBytes = 0
        p.srcY = 0
        p.srcZ = 0
    else:
        p.dstMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
        p.dstDevice = <cydriver.CUdeviceptr>dev_ptr
        p.dstPitch = width_bytes
        p.dstHeight = height
        p.dstXInBytes = 0
        p.dstY = 0
        p.dstZ = 0
    return 0

189  

190  

cdef _copy3d(OpaqueArray arr, object other, Stream stream, bint to_array):
    """Enqueue an asynchronous full-extent 3D copy between ``arr`` and ``other``.

    ``to_array`` selects the direction: true copies from ``other`` into
    ``arr``, false copies from ``arr`` into ``other``. ``stream`` must
    already be a concrete :class:`Stream` (callers coerce via
    :func:`Stream_accept`).

    NOTE(review): a host buffer view taken for ``other`` is released as soon
    as ``cuMemcpy3DAsync`` returns, not when the copy has executed — confirm
    callers keep ``other`` alive and unmodified until the stream has run it.
    """
    cdef cydriver.CUDA_MEMCPY3D copy_desc
    cdef cpython.Py_buffer host_view
    cdef int view_held = 0
    cdef intptr_t raw_stream
    cdef cydriver.CUstream cu_stream
    cdef size_t row_bytes, rows, slices

    row_bytes, rows, slices = arr._extent_bytes()

    memset(&copy_desc, 0, sizeof(copy_desc))
    copy_desc.WidthInBytes = row_bytes
    copy_desc.Height = rows
    copy_desc.Depth = slices

    try:
        # The array is the destination exactly when the linear side is the
        # source, so one flag drives both endpoints. Filling the array side
        # cannot fail and only touches the local descriptor.
        _fill_array_endpoint(&copy_desc, arr, not to_array)
        view_held = _fill_linear_endpoint(
            &copy_desc, other, to_array, row_bytes, rows, slices, &host_view
        )

        raw_stream = int((<Stream>stream).handle)
        cu_stream = <cydriver.CUstream><void*>raw_stream
        with nogil:
            HANDLE_RETURN(cydriver.cuMemcpy3DAsync(&copy_desc, cu_stream))
    finally:
        # Only the host path acquires a view; the Buffer path reports 0.
        if view_held:
            cpython.PyBuffer_Release(&host_view)

229  

230  

cdef class OpaqueArray:
    """An opaque, hardware-laid-out GPU allocation for texture/surface access.

    Distinct from :class:`Buffer`: a ``CUarray`` has no exposed device pointer
    and can only be accessed from kernels through a :class:`TextureObject` or
    :class:`SurfaceObject`. Its memory layout is chosen by the driver for 2D/3D
    spatial locality.

    **Copy-only interop.** Because the layout is opaque and there is no linear
    device pointer, an ``OpaqueArray`` cannot expose ``__cuda_array_interface__`` /
    DLPack and cannot be shared zero-copy with NumPy, CuPy, numba-cuda, or
    PyTorch. Moving data in or out is therefore always a copy: use
    :meth:`copy_from` / :meth:`copy_to` against a linear :class:`Buffer` or a
    host buffer-protocol object. There is no allocation helper — allocate the
    linear :class:`Buffer` yourself (e.g. ``mr.allocate(arr.size_bytes,
    stream=s)``) and copy.

    Construct via :meth:`from_descriptor`. Only plain 1D/2D/3D allocations are
    supported in this initial version; layered/cubemap/sparse variants will
    follow once their shape semantics are settled.
    """

    def __init__(self, *args, **kwargs):
        raise RuntimeError(
            "OpaqueArray cannot be instantiated directly. Use OpaqueArray.from_descriptor()."
        )

    @classmethod
    def from_descriptor(cls, *, shape, format, num_channels, is_surface_load_store=False):
        """Allocate a new CUDA array.

        Parameters
        ----------
        shape : tuple of int
            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
            in elements.
        format : ArrayFormat
            Element format.
        num_channels : int
            Channels per element. Must be 1, 2, or 4.
        is_surface_load_store : bool
            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array
            can be bound as a :class:`SurfaceObject` for kernel-side writes.
            Default False.

        Returns
        -------
        OpaqueArray

        Raises
        ------
        TypeError, ValueError
            If ``format``, ``num_channels`` or ``shape`` fail validation.
        """
        _validate_format_channels(format, num_channels)
        shape_t = _validate_array_shape(shape)

        cdef cydriver.CUarray_format c_format = <cydriver.CUarray_format><int>format
        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
        cdef int rank = len(shape_t)
        cdef unsigned int flags = (
            cydriver.CUDA_ARRAY3D_SURFACE_LDST if is_surface_load_store else 0
        )

        # cuArray3DCreate handles 1D/2D/3D uniformly (Height/Depth 0 sentinels),
        # so a single descriptor + create_array_handle covers every shape.
        memset(&desc3d, 0, sizeof(desc3d))
        desc3d.Width = <size_t>shape_t[0]
        desc3d.Height = <size_t>(shape_t[1] if rank >= 2 else 0)
        desc3d.Depth = <size_t>(shape_t[2] if rank >= 3 else 0)
        desc3d.Format = c_format
        desc3d.NumChannels = <unsigned int>num_channels
        desc3d.Flags = flags

        cdef OpaqueArrayHandle h = create_array_handle(desc3d)
        if not h:
            # An empty handle means creation failed; surface the recorded error.
            HANDLE_RETURN(get_last_error())

        cdef OpaqueArray self = cls.__new__(cls)
        self._handle = h
        self._shape = shape_t
        self._format = c_format
        self._num_channels = num_channels
        self._surface_load_store = bool(is_surface_load_store)
        self._device_id = _get_current_device_id()
        return self

    @classmethod
    def _from_handle(cls, intptr_t handle, bint owning, *, device_id=None):
        """Wrap an externally-allocated ``CUarray``.

        Intended for graphics interop (``cuGraphicsSubResourceGetMappedArray``)
        where the array is owned by the graphics API. With ``owning=False`` the
        underlying ``CUarray`` is never destroyed by this object. Shape, format,
        and channel count are queried from the driver. When ``device_id`` is
        None the current device id is recorded.
        """
        cdef cydriver.CUarray raw = <cydriver.CUarray><void*>handle
        cdef OpaqueArrayHandle h
        if owning:
            h = create_array_handle_owning(raw)
        else:
            h = create_array_handle_ref(raw)
        cdef int dev = _get_current_device_id() if device_id is None else int(device_id)
        return _array_from_handle(h, dev)

    @property
    def handle(self):
        """The underlying ``CUarray`` as an integer."""
        return as_intptr(self._handle)

    @property
    def shape(self):
        """Allocation shape, in elements."""
        return self._shape

    @property
    def format(self):
        """The element :class:`ArrayFormat`.

        Raises ``ValueError`` if the stored driver format has no
        :class:`ArrayFormat` member.
        """
        return ArrayFormat(self._format)

    @property
    def num_channels(self):
        """Channels per element (1, 2, or 4)."""
        return self._num_channels

    @property
    def element_size(self):
        """Bytes per element (format size * channels).

        This is the single place the per-element size is computed;
        :meth:`_extent_bytes` and :attr:`size_bytes` both go through it.
        """
        return _FORMAT_ELEM_SIZE[self._format] * self._num_channels

    @property
    def device(self):
        """The :class:`Device` this array was allocated on."""
        # Imported here rather than at module level, as in the original.
        from cuda.core._device import Device
        return Device(self._device_id)

    @property
    def is_surface_load_store(self):
        """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST``
        and can be bound as a :class:`SurfaceObject`."""
        return self._surface_load_store

    def _extent_bytes(self):
        """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth
        normalized to >=1 for lower-rank arrays."""
        cdef int rank = len(self._shape)
        cdef size_t w = <size_t>self._shape[0] * <size_t>self.element_size
        cdef size_t h = <size_t>(self._shape[1] if rank >= 2 else 1)
        cdef size_t d = <size_t>(self._shape[2] if rank >= 3 else 1)
        return w, h, d

    def copy_from(self, src, *, stream) -> None:
        """Copy a full-array's worth of data into this array.

        Parameters
        ----------
        src : Buffer or buffer-protocol object
            Source data. Must contain at least ``self.size_bytes`` bytes
            of contiguous data.
        stream : Stream or GraphBuilder
            Stream to issue the copy on. A :class:`~cuda.core.graph.GraphBuilder`
            is accepted so the copy can be captured into a graph.
        """
        _copy3d(self, src, Stream_accept(stream), to_array=True)

    def copy_to(self, dst, *, stream):
        """Copy a full-array's worth of data out of this array.

        Parameters
        ----------
        dst : Buffer or writable buffer-protocol object
            Destination. Must have at least ``self.size_bytes`` bytes of
            writable, contiguous space.
        stream : Stream or GraphBuilder
            Stream to issue the copy on. A :class:`~cuda.core.graph.GraphBuilder`
            is accepted so the copy can be captured into a graph.

        Returns
        -------
        The ``dst`` object, for parity with :meth:`Buffer.copy_to`.
        """
        _copy3d(self, dst, Stream_accept(stream), to_array=False)
        return dst

    @property
    def size_bytes(self):
        """Total bytes of array storage (``prod(shape) * element_size``)."""
        cdef size_t n = 1
        for s in self._shape:
            n *= <size_t>s
        return n * <size_t>self.element_size

    cpdef close(self):
        """Release this object's reference to the underlying ``CUarray``.

        Destruction (``cuArrayDestroy``) happens via the handle's deleter when
        the last reference is dropped; for a non-owning handle (graphics interop
        or a mipmap-level view) nothing is destroyed. Idempotent: a second call
        (or destruction after ``close()``) is a no-op.
        """
        self._handle.reset()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def __repr__(self):
        # An array wrapped through _from_handle stores whatever format the
        # driver reports, which need not be an ArrayFormat member; repr must
        # stay usable for it, so fall back to the raw value.
        try:
            fmt_name = ArrayFormat(self._format).name
        except ValueError:
            fmt_name = f"<CUarray_format {int(self._format)}>"
        return (
            f"OpaqueArray(shape={self._shape}, "
            f"format={fmt_name}, "
            f"num_channels={self._num_channels})"
        )

442  

443  

cdef OpaqueArray _array_from_handle(OpaqueArrayHandle h, int device_id):
    """Build an :class:`OpaqueArray` around an existing ``OpaqueArrayHandle``.

    Shape, format, channel count and the surface load/store flag are read
    back from the driver with ``cuArray3DGetDescriptor``; ``device_id`` is
    stored as given.

    Any owning/non-owning semantics and parent (mipmap) dependency are already
    captured structurally inside ``h``'s C++ box.

    NOTE(review): a descriptor with ``Height == 0`` and ``Depth > 0`` yields a
    shape containing a 0 extent — confirm such arrays never reach this path.
    """
    cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc
    cdef cydriver.CUarray raw
    cdef OpaqueArray wrapped

    if not h:
        # Empty handle: report the error recorded when it was created.
        HANDLE_RETURN(get_last_error())

    wrapped = OpaqueArray.__new__(OpaqueArray)
    wrapped._handle = h
    wrapped._device_id = device_id

    raw = as_cu(h)
    with nogil:
        HANDLE_RETURN(cydriver.cuArray3DGetDescriptor(&desc, raw))

    # A zero Height/Depth in the descriptor marks a lower-rank array, so the
    # rank is recovered from which extents are non-zero.
    extents = [int(desc.Width)]
    if desc.Depth > 0:
        extents.append(int(desc.Height))
        extents.append(int(desc.Depth))
    elif desc.Height > 0:
        extents.append(int(desc.Height))
    wrapped._shape = tuple(extents)

    wrapped._format = desc.Format
    wrapped._num_channels = desc.NumChannels
    wrapped._surface_load_store = bool(desc.Flags & cydriver.CUDA_ARRAY3D_SURFACE_LDST)
    return wrapped

472 return self 1CBc