Coverage for cuda/core/_launch_config.pyx: 62.37%

93 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-08 01:07 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from libc.string cimport memset 

6  

7from cuda.core._utils.cuda_utils cimport ( 

8 HANDLE_RETURN, 

9) 

10  

11import threading 

12  

13from cuda.core._device import Device 

14from cuda.core._utils.cuda_utils import ( 

15 CUDAError, 

16 cast_to_3_tuple, 

17 driver, 

18 get_binding_version, 

19) 

20  

21  

# Module-level lazy-initialization state, guarded by _lock (see _lazy_init).
cdef bint _inited = False
# True when both the installed cuda-python bindings and the loaded driver
# support the *Ex launch path (>= 11.8); computed once in _lazy_init.
cdef bint _use_ex = False
cdef object _lock = threading.Lock()

# Attribute names for identity comparison and representation
_LAUNCH_CONFIG_ATTRS = ('grid', 'cluster', 'block', 'shmem_size', 'cooperative_launch')

28  

29  

cdef int _lazy_init() except?-1:
    """Probe driver/binding versions once, thread-safely.

    Sets the module-level ``_use_ex`` flag to True when both the driver and
    the cuda-python bindings are at least 11.8, and marks ``_inited`` so
    subsequent calls return immediately. Returns 0 on success; the
    ``except?-1`` declaration lets Cython propagate Python exceptions raised
    by HANDLE_RETURN or get_binding_version.
    """
    global _inited, _use_ex
    # Fast path: already initialized, no lock needed.
    if _inited:
        return 0

    cdef tuple _py_major_minor
    cdef int _driver_ver
    with _lock:
        # Double-checked locking: another thread may have finished
        # initialization while we waited for the lock.
        if _inited:
            return 0

        # binding availability depends on cuda-python version
        _py_major_minor = get_binding_version()
        HANDLE_RETURN(cydriver.cuDriverGetVersion(&_driver_ver))
        _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
        # Set the flag last so readers of the fast path never observe a
        # partially-initialized state.
        _inited = True

    return 0

48  

49  

cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster (blocks) ->
    block (threads). Each dimension in grid specifies clusters in the grid, each dimension in
    cluster specifies blocks per cluster, and each dimension in block specifies
    threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When cluster
        is not specified, this represents the number of blocks, otherwise
        this represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread blocks have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Default to size 0)
    cooperative_launch : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(self, grid=None, cluster=None, block=None,
                 shmem_size=None, cooperative_launch=False):
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks or clusters if cluster is specified)
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster)
        block : Union[tuple, int], optional
            Block dimensions (threads per block)
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0)
        cooperative_launch : bool, optional
            Whether to launch as cooperative kernel (default: False)

        Raises
        ------
        CUDAError
            If cluster is given but the bindings/driver are older than 11.8,
            if the current device's compute capability is below 9.0, or if
            cooperative_launch is requested on a device that does not
            support it.
        """
        # Ensure the module-level _use_ex capability flag is computed.
        _lazy_init()

        # Convert and validate grid and block dimensions
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)

        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
        # look up the device from stream. We probably need to defer the checks related to
        # device compute capability or attributes.
        # thread block clusters are supported starting H100
        if cluster is not None:
            if not _use_ex:
                # Include the actual driver version in the error when we can
                # query it successfully; otherwise omit it.
                err, drvers = driver.cuDriverGetVersion()
                drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            self.cluster = None

        # Handle shmem_size default
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        # Handle cooperative_launch
        self.cooperative_launch = cooperative_launch

        # Validate cooperative launch support
        if self.cooperative_launch and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")

    def _identity(self):
        # Tuple of the attributes that define equality/hashing, in the fixed
        # order given by _LAUNCH_CONFIG_ATTRS.
        return tuple(getattr(self, attr) for attr in _LAUNCH_CONFIG_ATTRS)

    def __repr__(self):
        """Return string representation of LaunchConfig."""
        parts = ', '.join(f'{attr}={getattr(self, attr)!r}' for attr in _LAUNCH_CONFIG_ATTRS)
        return f"LaunchConfig({parts})"

    def __eq__(self, other) -> bool:
        # NotImplemented (not False) lets Python try the reflected comparison
        # when other is not a LaunchConfig.
        if not isinstance(other, LaunchConfig):
            return NotImplemented
        return self._identity() == (<LaunchConfig>other)._identity()

    def __hash__(self) -> int:
        # Consistent with __eq__: both are derived from _identity().
        return hash(self._identity())

    cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
        """Build a zero-initialized cydriver.CUlaunchConfig from this config.

        Launch attributes are stored in self._attrs (declared in the .pxd);
        the returned struct's attrs pointer refers into that storage, so it
        presumably must not outlive this object or a later call to this
        method — TODO confirm callers respect that.
        """
        _lazy_init()
        cdef cydriver.CUlaunchConfig drv_cfg
        cdef cydriver.CUlaunchAttribute attr
        # Zero the struct so unset fields are well-defined.
        memset(&drv_cfg, 0, sizeof(drv_cfg))
        # Reuse the attribute vector: drop any entries from a prior call.
        self._attrs.resize(0)

        # Handle grid dimensions and cluster configuration
        if self.cluster is not None:
            # Convert grid from cluster units to block units
            drv_cfg.gridDimX = self.grid[0] * self.cluster[0]
            drv_cfg.gridDimY = self.grid[1] * self.cluster[1]
            drv_cfg.gridDimZ = self.grid[2] * self.cluster[2]

            # Set up cluster attribute
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster
            self._attrs.push_back(attr)
        else:
            drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid

        drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block
        drv_cfg.sharedMemBytes = self.shmem_size

        if self.cooperative_launch:
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
            attr.value.cooperative = 1
            self._attrs.push_back(attr)

        drv_cfg.numAttrs = self._attrs.size()
        drv_cfg.attrs = self._attrs.data()

        return drv_cfg

188  

189  

190# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above 

cpdef object _to_native_launch_config(LaunchConfig config):
    """Translate a LaunchConfig into a driver-level CUlaunchConfig.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration
    """
    _lazy_init()

    cdef object native = driver.CUlaunchConfig()
    cdef list attributes = []
    cdef object launch_attr
    cdef object cluster_dim

    if config.cluster is not None:
        # The user-facing grid is expressed in clusters here; the driver
        # expects block units, so scale each axis by the cluster extent.
        native.gridDimX = config.grid[0] * config.cluster[0]
        native.gridDimY = config.grid[1] * config.cluster[1]
        native.gridDimZ = config.grid[2] * config.cluster[2]

        # Attach the cluster-dimension launch attribute.
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        cluster_dim = launch_attr.value.clusterDim
        cluster_dim.x, cluster_dim.y, cluster_dim.z = config.cluster
        attributes.append(launch_attr)
    else:
        # No cluster: grid is already in block units.
        native.gridDimX, native.gridDimY, native.gridDimZ = config.grid

    native.blockDimX, native.blockDimY, native.blockDimZ = config.block
    native.sharedMemBytes = config.shmem_size

    if config.cooperative_launch:
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        launch_attr.value.cooperative = 1
        attributes.append(launch_attr)

    native.numAttrs = len(attributes)
    native.attrs = attributes

    return native