Coverage for cuda / core / experimental / _memory / _buffer.pyx: 91%

158 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-10 01:19 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from __future__ import annotations 

6  

7from libc.stdint cimport uintptr_t, int64_t, uint64_t 

8  

9from cuda.bindings cimport cydriver 

10from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource 

11from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer 

12from cuda.core.experimental._memory cimport _ipc 

13from cuda.core.experimental._stream cimport Stream_accept, Stream 

14from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN 

15  

16import abc 

17from typing import TypeVar, Union 

18  

19from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule 

20from cuda.core.experimental._utils.cuda_utils import driver 

21  

__all__ = ["Buffer", "MemoryResource"]


# Alias used in this module's signatures wherever a raw buffer handle is
# accepted or returned (see Buffer.handle, MemoryResource.deallocate).
DevicePointerT = Union[driver.CUdeviceptr, int, None]
"""
A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting
:attr:`Buffer.handle`.
"""

30  

cdef class Buffer:
    """Represent a handle to allocated memory.

    This generic object provides a unified representation of how
    different memory resources give access to their memory
    allocations.

    Support for data interchange mechanisms is provided by DLPack.

    Instances cannot be created through the constructor (``__init__``
    always raises); they are produced by memory resources or by the
    :meth:`from_handle` / :meth:`from_ipc_descriptor` factories.
    """
    def __cinit__(self):
        self._clear()

    def _clear(self):
        # Put every field into its "no allocation" state.
        self._ptr = 0
        self._size = 0
        self._memory_resource = None
        self._ipc_data = None
        self._ptr_obj = None
        self._alloc_stream = None

    def __init__(self, *args, **kwargs):
        raise RuntimeError("Buffer objects cannot be instantiated directly. "
                           "Please use MemoryResource APIs.")

    @classmethod
    def _init(
        cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None,
        stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None
    ):
        """Internal factory: build a buffer without going through ``__init__``.

        Parameters
        ----------
        ptr : :obj:`~_memory.DevicePointerT`
            Handle of the allocation; it is kept as-is for :attr:`handle` and
            also converted with ``int()`` to obtain the raw address.
        size : int
            Memory size of the buffer
        mr : :obj:`~_memory.MemoryResource`, optional
            Memory resource that :meth:`close` hands the allocation back to
        stream : :obj:`~_stream.Stream`, optional
            Stream remembered as the default deallocation stream
        ipc_descriptor : IPCBufferDescriptor, optional
            Descriptor to attach to the new buffer, if any
        """
        # __new__ runs __cinit__ (which clears the fields) but skips __init__,
        # which would raise.
        cdef Buffer self = Buffer.__new__(cls)
        # NOTE(review): DevicePointerT admits None, but int(None) raises
        # TypeError here -- confirm whether None should map to a null pointer.
        self._ptr = <uintptr_t>(int(ptr))
        self._ptr_obj = ptr
        self._size = size
        self._memory_resource = mr
        # presumably the second argument marks the descriptor as imported
        # (mapped) rather than exported -- confirm against IPCDataForBuffer.
        self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
        self._alloc_stream = <Stream>(stream) if stream is not None else None
        return self

    def __dealloc__(self):
        # Release the allocation on the stream remembered at creation time.
        self.close(self._alloc_stream)

    def __reduce__(self):
        # Pickling goes through the IPC descriptor and the memory resource.
        # Must not serialize the parent's stream!
        return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor())

    @staticmethod
    def from_handle(
        ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None
    ) -> Buffer:
        """Create a new :class:`Buffer` object from a pointer.

        Parameters
        ----------
        ptr : :obj:`~_memory.DevicePointerT`
            Allocated buffer handle object
        size : int
            Memory size of the buffer
        mr : :obj:`~_memory.MemoryResource`, optional
            Memory resource associated with the buffer
        """
        # TODO: It is better to take a stream for later deallocation
        return Buffer._init(ptr, size, mr=mr)

    @classmethod
    def from_ipc_descriptor(
        cls, mr: DeviceMemoryResource, ipc_descriptor: IPCBufferDescriptor,
        stream: Stream = None
    ) -> Buffer:
        """Import a buffer that was exported from another process."""
        return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream)

    def get_ipc_descriptor(self) -> IPCBufferDescriptor:
        """Export a buffer allocated for sharing between processes."""
        # The descriptor is created on first use and cached afterwards.
        if self._ipc_data is None:
            self._ipc_data = IPCDataForBuffer(_ipc.Buffer_get_ipc_descriptor(self), False)
        return self._ipc_data.ipc_descriptor

    def close(self, stream: Stream | GraphBuilder | None = None):
        """Deallocate this buffer asynchronously on the given stream.

        This buffer is released back to its memory resource
        asynchronously on the given stream.

        Parameters
        ----------
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
            The stream object to use for asynchronous deallocation. If None,
            the behavior depends on the underlying memory resource.
        """
        Buffer_close(self, stream)

    def copy_to(self, dst: Buffer = None, *, stream: Stream | GraphBuilder) -> Buffer:
        """Copy from this buffer to the dst buffer asynchronously on the given stream.

        Copies the data from this buffer to the provided dst buffer.
        If the dst buffer is not provided, then a new buffer is first
        allocated using the associated memory resource before the copy.

        Parameters
        ----------
        dst : :obj:`~_memory.Buffer`, optional
            Destination buffer to copy data to. If None, a new buffer of the
            same size is allocated from this buffer's memory resource.
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
            Keyword argument specifying the stream for the
            asynchronous copy

        Returns
        -------
        :obj:`~_memory.Buffer`
            The destination buffer (``dst``, or the newly allocated one).

        Raises
        ------
        ValueError
            If ``dst`` is None and this buffer has no memory resource, or if
            the source and destination sizes differ.

        """
        stream = Stream_accept(stream)
        cdef Stream s_stream = <Stream>stream
        cdef size_t src_size = self._size

        if dst is None:
            if self._memory_resource is None:
                raise ValueError("a destination buffer must be provided (this "
                                 "buffer does not have a memory_resource)")
            dst = self._memory_resource.allocate(src_size, stream)

        cdef size_t dst_size = dst._size
        if dst_size != src_size:
            raise ValueError( "buffer sizes mismatch between src and dst (sizes "
                             f"are: src={src_size}, dst={dst_size})"
            )
        # Read the raw stream handle before the GIL is released.
        cdef cydriver.CUstream s = s_stream._handle
        with nogil:
            HANDLE_RETURN(cydriver.cuMemcpyAsync(
                <cydriver.CUdeviceptr>dst._ptr,
                <cydriver.CUdeviceptr>self._ptr,
                src_size,
                s
            ))
        return dst

    def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder):
        """Copy from the src buffer to this buffer asynchronously on the given stream.

        Parameters
        ----------
        src : :obj:`~_memory.Buffer`
            Source buffer to copy data from
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
            Keyword argument specifying the stream for the
            asynchronous copy

        Raises
        ------
        ValueError
            If the source and destination sizes differ.

        """
        stream = Stream_accept(stream)
        cdef Stream s_stream = <Stream>stream
        cdef size_t dst_size = self._size
        cdef size_t src_size = src._size

        if src_size != dst_size:
            raise ValueError( "buffer sizes mismatch between src and dst (sizes "
                             f"are: src={src_size}, dst={dst_size})"
            )
        # Read the raw stream handle before the GIL is released.
        cdef cydriver.CUstream s = s_stream._handle
        with nogil:
            HANDLE_RETURN(cydriver.cuMemcpyAsync(
                <cydriver.CUdeviceptr>self._ptr,
                <cydriver.CUdeviceptr>src._ptr,
                dst_size,
                s
            ))

    def fill(self, value: int, width: int, *, stream: Stream | GraphBuilder):
        """Fill this buffer with a value pattern asynchronously on the given stream.

        Parameters
        ----------
        value : int
            Integer value to fill the buffer with
        width : int
            Width in bytes for each element (must be 1, 2, or 4)
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
            Keyword argument specifying the stream for the asynchronous fill

        Raises
        ------
        ValueError
            If width is not 1, 2, or 4, if value is out of range for the width,
            or if buffer size is not divisible by width

        """
        cdef Stream s_stream = Stream_accept(stream)
        cdef unsigned char c_value8
        cdef unsigned short c_value16
        cdef unsigned int c_value32
        cdef size_t N

        # Validate width
        if width not in (1, 2, 4):
            raise ValueError(f"width must be 1, 2, or 4, got {width}")

        # Validate buffer size modulus.
        cdef size_t buffer_size = self._size
        if buffer_size % width != 0:
            raise ValueError(f"buffer size ({buffer_size}) must be divisible by width ({width})")

        # Map width (bytes) to bitwidth and validate value
        cdef int bitwidth = width * 8
        try:
            _validate_value_against_bitwidth(bitwidth, value, is_signed=False)
        except OverflowError:
            # The value did not even fit the helper's int64_t parameter, so
            # the conversion failed before any range check ran. Report it as
            # the documented ValueError. `width` is a Python int here, so the
            # shift below is arbitrary-precision.
            raise ValueError(
                f"value must be in range [0, {(1 << (8 * width)) - 1}]"
            ) from None

        # Dispatch to the memset variant matching the element width; N is the
        # number of elements, not bytes.
        cdef cydriver.CUstream s = s_stream._handle
        if width == 1:
            c_value8 = <unsigned char>value
            N = buffer_size
            with nogil:
                HANDLE_RETURN(cydriver.cuMemsetD8Async(<cydriver.CUdeviceptr>self._ptr, c_value8, N, s))
        elif width == 2:
            c_value16 = <unsigned short>value
            N = buffer_size // 2
            with nogil:
                HANDLE_RETURN(cydriver.cuMemsetD16Async(<cydriver.CUdeviceptr>self._ptr, c_value16, N, s))
        else:  # width == 4
            c_value32 = <unsigned int>value
            N = buffer_size // 4
            with nogil:
                HANDLE_RETURN(cydriver.cuMemsetD32Async(<cydriver.CUdeviceptr>self._ptr, c_value32, N, s))

    def __dlpack__(
        self,
        *,
        stream: int | None = None,
        max_version: tuple[int, int] | None = None,
        dl_device: tuple[int, int] | None = None,
        copy: bool | None = None,
    ) -> TypeVar("PyCapsule"):
        # Note: we ignore the stream argument entirely (as if it is -1).
        # It is the user's responsibility to maintain stream order.
        if dl_device is not None:
            raise BufferError("Sorry, not supported: dl_device other than None")
        if copy is True:
            raise BufferError("Sorry, not supported: copy=True")
        if max_version is None:
            versioned = False
        else:
            if not isinstance(max_version, tuple) or len(max_version) != 2:
                raise BufferError(f"Expected max_version tuple[int, int], got {max_version}")
            # A versioned capsule is only requested when the consumer
            # announces support for version 1.0 or newer.
            versioned = max_version >= (1, 0)
        capsule = make_py_capsule(self, versioned)
        return capsule

    def __dlpack_device__(self) -> tuple[int, int]:
        cdef bint d = self.is_device_accessible
        cdef bint h = self.is_host_accessible
        if d and (not h):
            return (DLDeviceType.kDLCUDA, self.device_id)
        if d and h:
            # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks
            return (DLDeviceType.kDLCUDAHost, 0)
        if (not d) and h:
            return (DLDeviceType.kDLCPU, 0)
        raise BufferError("buffer is neither device-accessible nor host-accessible")

    def __buffer__(self, flags: int, /) -> memoryview:
        # Support for Python-level buffer protocol as per PEP 688.
        # This raises a BufferError unless:
        #   1. Python is 3.12+
        #   2. This Buffer object is host accessible
        raise NotImplementedError("WIP: Buffer.__buffer__ hasn't been implemented yet.")

    def __release_buffer__(self, buffer: memoryview, /):
        # Supporting method paired with __buffer__.
        raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.")

    @property
    def device_id(self) -> int:
        """Return the device ordinal of this buffer."""
        if self._memory_resource is not None:
            return self._memory_resource.device_id
        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

    @property
    def handle(self) -> DevicePointerT:
        """Return the buffer handle object.

        .. caution::

            This handle is a Python object. To get the memory address of the underlying C
            handle, call ``int(Buffer.handle)``.
        """
        if self._ptr_obj is not None:
            return self._ptr_obj
        # No handle object is held: fall back to the raw address, which is 0
        # once the buffer has been closed (contract: Buffer is closed).
        return self._ptr

    @property
    def is_device_accessible(self) -> bool:
        """Return True if this buffer can be accessed by the GPU, otherwise False."""
        if self._memory_resource is not None:
            return self._memory_resource.is_device_accessible
        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

    @property
    def is_host_accessible(self) -> bool:
        """Return True if this buffer can be accessed by the CPU, otherwise False."""
        if self._memory_resource is not None:
            return self._memory_resource.is_host_accessible
        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

    @property
    def is_mapped(self) -> bool:
        """Return True if this buffer is mapped into the process via IPC."""
        # getattr with a default also covers the case where no IPC data is
        # attached (``_ipc_data`` is None).
        return getattr(self._ipc_data, "is_mapped", False)

    @property
    def memory_resource(self) -> MemoryResource:
        """Return the memory resource associated with this buffer."""
        return self._memory_resource

    @property
    def size(self) -> int:
        """Return the memory size of this buffer."""
        return self._size

348  

349  

350# Buffer Implementation 

351# --------------------- 

cdef inline void Buffer_close(Buffer self, stream):
    """Hand the buffer's allocation back to its memory resource.

    Nothing happens when the buffer has a null pointer or no memory
    resource attached. Otherwise the allocation is passed to the resource's
    ``deallocate`` on ``stream`` (normalized through ``Stream_accept``), or
    on the buffer's stored allocation stream when ``stream`` is None. The
    pointer, handle object, memory resource and allocation stream are then
    reset; the size and IPC data are left as they are.
    """
    cdef Stream s

    # Guard: nothing to release.
    if not self._ptr or self._memory_resource is None:
        return

    if stream is None:
        s = self._alloc_stream
    else:
        s = Stream_accept(stream)

    self._memory_resource.deallocate(self._ptr, self._size, s)

    # Drop every reference tied to the released allocation.
    self._ptr = 0
    self._memory_resource = None
    self._ptr_obj = None
    self._alloc_stream = None

361  

362  

cdef class MemoryResource:
    """Abstract base class for memory resources that manage allocation and
    deallocation of buffers.

    Subclasses must implement the :meth:`allocate` and :meth:`deallocate`
    methods, together with the properties describing this memory resource.
    Every :class:`Buffer` allocated and returned by :meth:`allocate` holds a
    reference to the resource that produced it, so a buffer answers its own
    property queries simply by looking up the corresponding property on the
    underlying memory resource.
    """

    @abc.abstractmethod
    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
        """Allocate a buffer of the requested size.

        Parameters
        ----------
        size : int
            Number of bytes to allocate.
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
            Stream on which the allocation is performed asynchronously.
            When None, each memory resource implementation decides and
            documents what happens.

        Returns
        -------
        Buffer
            The allocated buffer object; whether it is usable for device or
            host operations depends on the resource's properties.
        """
        ...

    @abc.abstractmethod
    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
        """Deallocate a buffer previously allocated by this resource.

        Parameters
        ----------
        ptr : :obj:`~_memory.DevicePointerT`
            Pointer or handle of the buffer to release.
        size : int
            Size in bytes of the buffer to release.
        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
            Stream on which the deallocation is performed asynchronously.
            When None, each memory resource implementation decides and
            documents what happens.
        """
        ...

412  

413  

414# Helper Functions 

415# ---------------- 

cdef void _validate_value_against_bitwidth(int bitwidth, int64_t value, bint is_signed=False) except *:
    """Check that ``value`` is representable in ``bitwidth`` bits.

    Parameters
    ----------
    bitwidth : int
        Number of bits (e.g., 8, 16, 32); must be less than 64
    value : int64_t
        Value to check
    is_signed : bool, optional
        Use the signed range instead of the unsigned one (default: False)

    Raises
    ------
    ValueError
        If value lies outside the representable range for the bitwidth
    """
    cdef int max_bits = bitwidth
    assert max_bits < 64, f"bitwidth ({max_bits}) must be less than 64"

    cdef int64_t min_value = 0
    cdef int64_t max_value

    if is_signed:
        # Signed range: [-2**(bits-1), 2**(bits-1) - 1].
        max_value = (<int64_t>1 << (max_bits - 1)) - 1
        min_value = -max_value - 1
    else:
        # Unsigned range: [0, 2**bits - 1]; the shift is done in uint64_t and
        # the result converted back to int64_t for the comparison.
        max_value = <int64_t>((<uint64_t>1 << max_bits) - 1)

    if value < min_value or value > max_value:
        raise ValueError(
            f"value must be in range [{min_value}, {max_value}]"
        )