Coverage for cuda / core / _launch_config.pyx: 56.34%

71 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-29 01:27 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from libc.string cimport memset 

6  

7from cuda.core._device import Device 

8from cuda.core._utils.cuda_utils import ( 

9 CUDAError, 

10 cast_to_3_tuple, 

11 driver, 

12) 

13  

# Attribute names that define a LaunchConfig's identity; consumed by
# _identity(), __repr__, __eq__, and __hash__ so they always agree on
# which fields participate in comparison and display.
_LAUNCH_CONFIG_ATTRS = ('grid', 'cluster', 'block', 'shmem_size', 'cooperative_launch')

15  

16  

cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster (blocks) ->
    block (threads). Each dimension in grid specifies clusters in the grid, each dimension in
    cluster specifies blocks per cluster, and each dimension in block specifies
    threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When cluster
        is not specified, this represents the number of blocks, otherwise
        this represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread blocks have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Default to size 0)
    cooperative_launch : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(self, grid=None, cluster=None, block=None,
                 shmem_size=None, cooperative_launch=False):
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks or clusters if cluster is specified)
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster)
        block : Union[tuple, int], optional
            Block dimensions (threads per block)
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0)
        cooperative_launch : bool, optional
            Whether to launch as cooperative kernel (default: False)

        Raises
        ------
        CUDAError
            If ``cluster`` is given on a device with compute capability < 9.0,
            or if ``cooperative_launch`` is requested on a device that does not
            support cooperative kernels.
        """
        # Convert and validate grid and block dimensions; cast_to_3_tuple
        # normalizes ints and short tuples to a canonical 3-tuple.
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)

        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
        # look up the device from stream. We probably need to defer the checks related to
        # device compute capability or attributes.
        # thread block clusters are supported starting H100
        if cluster is not None:
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            # cluster stays None (not a tuple) so downstream code can use
            # "is not None" to detect cluster launches.
            self.cluster = None

        # Handle shmem_size default (None means "no dynamic shared memory").
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        # Handle cooperative_launch
        self.cooperative_launch = cooperative_launch

        # Validate cooperative launch support against the current device.
        if self.cooperative_launch and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")

    def _identity(self):
        # Value tuple over _LAUNCH_CONFIG_ATTRS; the single source of truth
        # for __eq__ and __hash__ so they can never drift apart.
        return tuple(getattr(self, attr) for attr in _LAUNCH_CONFIG_ATTRS)

    def __repr__(self):
        """Return string representation of LaunchConfig."""
        parts = ', '.join(f'{attr}={getattr(self, attr)!r}' for attr in _LAUNCH_CONFIG_ATTRS)
        return f"LaunchConfig({parts})"

    def __eq__(self, other) -> bool:
        # NotImplemented (not False) lets Python try the reflected comparison.
        if not isinstance(other, LaunchConfig):
            return NotImplemented
        return self._identity() == (<LaunchConfig>other)._identity()

    def __hash__(self) -> int:
        # Consistent with __eq__: both derive from _identity().
        return hash(self._identity())

    cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
        # Build the C-level CUlaunchConfig for this config.
        # NOTE(review): the returned struct's ``attrs`` field points into
        # ``self._attrs``'s storage, so it is only valid while this object is
        # alive and ``_attrs`` is not modified — callers should consume it
        # promptly.
        cdef cydriver.CUlaunchConfig drv_cfg
        cdef cydriver.CUlaunchAttribute attr
        # Zero the whole struct so untouched fields have defined values.
        memset(&drv_cfg, 0, sizeof(drv_cfg))
        # Reuse the persistent attribute vector; clear leftovers from prior calls.
        self._attrs.resize(0)

        # Handle grid dimensions and cluster configuration
        if self.cluster is not None:
            # Convert grid from cluster units to block units: the driver
            # expects gridDim* in blocks, while self.grid counts clusters.
            drv_cfg.gridDimX = self.grid[0] * self.cluster[0]
            drv_cfg.gridDimY = self.grid[1] * self.cluster[1]
            drv_cfg.gridDimZ = self.grid[2] * self.cluster[2]

            # Set up cluster attribute (attr is copied into the vector by push_back).
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster
            self._attrs.push_back(attr)
        else:
            drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid

        drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block
        drv_cfg.sharedMemBytes = self.shmem_size

        if self.cooperative_launch:
            # attr is a stack struct; safe to reuse since push_back copies it.
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
            attr.value.cooperative = 1
            self._attrs.push_back(attr)

        drv_cfg.numAttrs = self._attrs.size()
        drv_cfg.attrs = self._attrs.data()

        return drv_cfg

148  

149  

# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above
cpdef object _to_native_launch_config(LaunchConfig config):
    """Convert LaunchConfig to native driver CUlaunchConfig.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration
    """
    cdef object native_cfg = driver.CUlaunchConfig()
    cdef list launch_attrs = []
    cdef object cluster_attr
    cdef object coop_attr
    cdef object cluster_dim

    # Grid dimensions: plain block counts, or cluster counts scaled up.
    if config.cluster is None:
        native_cfg.gridDimX, native_cfg.gridDimY, native_cfg.gridDimZ = config.grid
    else:
        # config.grid counts clusters; the driver wants blocks, so scale
        # each axis by the cluster shape.
        native_cfg.gridDimX = config.grid[0] * config.cluster[0]
        native_cfg.gridDimY = config.grid[1] * config.cluster[1]
        native_cfg.gridDimZ = config.grid[2] * config.cluster[2]

        # Attach the cluster-dimension launch attribute.
        cluster_attr = driver.CUlaunchAttribute()
        cluster_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        cluster_dim = cluster_attr.value.clusterDim
        cluster_dim.x, cluster_dim.y, cluster_dim.z = config.cluster
        launch_attrs.append(cluster_attr)

    native_cfg.blockDimX, native_cfg.blockDimY, native_cfg.blockDimZ = config.block
    native_cfg.sharedMemBytes = config.shmem_size

    if config.cooperative_launch:
        # Mark the launch as cooperative via its own attribute entry.
        coop_attr = driver.CUlaunchAttribute()
        coop_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        coop_attr.value.cooperative = 1
        launch_attrs.append(coop_attr)

    native_cfg.numAttrs = len(launch_attrs)
    native_cfg.attrs = launch_attrs

    return native_cfg