Coverage for cuda / core / _launch_config.pyx: 56.34%

71 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-22 01:37 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from libc.string cimport memset 

6  

7from cuda.core._device import Device 

8from cuda.core._utils.cuda_utils import ( 

9 CUDAError, 

10 cast_to_3_tuple, 

11 driver, 

12) 

13  

# Names of the LaunchConfig attributes, in the order used to build the
# identity tuple (equality / hashing) and the repr string.
14_LAUNCH_CONFIG_ATTRS = ('grid', 'cluster', 'block', 'shmem_size', 'is_cooperative') 

15  

16  

cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster (blocks) ->
    block (threads). Each dimension in grid specifies clusters in the grid, each dimension in
    cluster specifies blocks per cluster, and each dimension in block specifies
    threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When cluster
        is not specified, this represents the number of blocks, otherwise
        this represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread block have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Default to size 0)
    is_cooperative : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(self, grid=None, cluster=None, block=None,
                 shmem_size=None, is_cooperative=False):
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks or clusters if cluster is specified)
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster)
        block : Union[tuple, int], optional
            Block dimensions (threads per block)
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0)
        is_cooperative : bool, optional
            Whether to launch as cooperative kernel (default: False)

        Raises
        ------
        CUDAError
            If ``cluster`` is given and the device returned by ``Device()``
            reports a compute capability below (9, 0), or if
            ``is_cooperative`` is true and that device does not report
            cooperative-launch support.
        """
        # Convert and validate grid and block dimensions
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)

        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
        # look up the device from stream. We probably need to defer the checks related to
        # device compute capability or attributes.
        # thread block clusters are supported starting H100
        if cluster is not None:
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            self.cluster = None

        # Handle shmem_size default
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        self.is_cooperative = is_cooperative

        # The device is only queried when a cooperative launch was requested.
        if self.is_cooperative and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")

    def _identity(self):
        """Return the attribute values named in ``_LAUNCH_CONFIG_ATTRS`` as a tuple.

        The tuple is what ``__eq__`` compares and ``__hash__`` hashes.
        """
        return tuple(getattr(self, attr) for attr in _LAUNCH_CONFIG_ATTRS)

    def __repr__(self):
        """Return string representation of LaunchConfig."""
        parts = ', '.join(f'{attr}={getattr(self, attr)!r}' for attr in _LAUNCH_CONFIG_ATTRS)
        return f"LaunchConfig({parts})"

    def __eq__(self, other) -> bool:
        """Compare identity tuples; defer to ``other`` if it is not a LaunchConfig."""
        if not isinstance(other, LaunchConfig):
            return NotImplemented
        return self._identity() == (<LaunchConfig>other)._identity()

    def __hash__(self) -> int:
        """Hash the identity tuple so that equal configs hash equally."""
        return hash(self._identity())

    cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
        """Build the driver-level ``CUlaunchConfig`` struct for this config.

        The returned struct's ``attrs`` pointer refers to storage held in
        ``self._attrs``, which is emptied and refilled on every call.
        NOTE(review): ``_attrs`` is declared in the .pxd and looks like a C++
        vector; the pointer is presumably only valid while this object is
        alive and until the next call -- confirm against the .pxd/callers.
        """
        cdef cydriver.CUlaunchConfig drv_cfg
        cdef cydriver.CUlaunchAttribute attr
        memset(&drv_cfg, 0, sizeof(drv_cfg))
        self._attrs.resize(0)

        # Handle grid dimensions and cluster configuration
        if self.cluster is not None:
            # Convert grid from cluster units to block units
            drv_cfg.gridDimX = self.grid[0] * self.cluster[0]
            drv_cfg.gridDimY = self.grid[1] * self.cluster[1]
            drv_cfg.gridDimZ = self.grid[2] * self.cluster[2]

            # Set up cluster attribute. `attr` is an uninitialised stack local
            # and only `id` plus one `value` member are assigned, so zero it
            # first to avoid handing indeterminate bytes to the driver.
            memset(&attr, 0, sizeof(attr))
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster
            self._attrs.push_back(attr)
        else:
            drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid

        drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block
        drv_cfg.sharedMemBytes = self.shmem_size

        if self.is_cooperative:
            # Re-zero: `attr` may still hold the cluster dimensions from above.
            memset(&attr, 0, sizeof(attr))
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
            attr.value.cooperative = 1
            self._attrs.push_back(attr)

        drv_cfg.numAttrs = self._attrs.size()
        drv_cfg.attrs = self._attrs.data()

        return drv_cfg

146  

147  

148# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above 

cpdef object _to_native_launch_config(LaunchConfig config):
    """Translate a LaunchConfig into a Python-level ``driver.CUlaunchConfig``.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration, with grid/block dimensions,
        shared-memory size, and any launch attributes (cluster dimension,
        cooperative) filled in.
    """
    cdef object native_cfg = driver.CUlaunchConfig()
    cdef list launch_attrs = []
    cdef object launch_attr
    cdef object cluster_dim
    cdef object blocks_x
    cdef object blocks_y
    cdef object blocks_z

    if config.cluster is None:
        # No clusters: the grid is already expressed in blocks.
        native_cfg.gridDimX, native_cfg.gridDimY, native_cfg.gridDimZ = config.grid
    else:
        # The grid is given in clusters; the driver wants it in blocks.
        blocks_x = config.grid[0] * config.cluster[0]
        blocks_y = config.grid[1] * config.cluster[1]
        blocks_z = config.grid[2] * config.cluster[2]
        native_cfg.gridDimX = blocks_x
        native_cfg.gridDimY = blocks_y
        native_cfg.gridDimZ = blocks_z

        # Describe the cluster shape as a launch attribute.
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        cluster_dim = launch_attr.value.clusterDim
        cluster_dim.x, cluster_dim.y, cluster_dim.z = config.cluster
        launch_attrs.append(launch_attr)

    native_cfg.blockDimX, native_cfg.blockDimY, native_cfg.blockDimZ = config.block
    native_cfg.sharedMemBytes = config.shmem_size

    if config.is_cooperative:
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        launch_attr.value.cooperative = 1
        launch_attrs.append(launch_attr)

    native_cfg.numAttrs = len(launch_attrs)
    native_cfg.attrs = launch_attrs

    return native_cfg