Coverage for cuda/core/_launch_config.pyx: 91.67%

72 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-13 01:38 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from libc.string cimport memset 

6  

7from typing import Any 

8  

9from cuda.core._device import Device 

10from cuda.core._utils.cuda_utils import ( 

11 CUDAError, 

12 cast_to_3_tuple, 

13 driver, 

14) 

15  

# Names of the LaunchConfig fields, in the order used by LaunchConfig's
# identity tuple (equality / hashing) and by its repr.
_LAUNCH_CONFIG_ATTRS = (
    'grid',
    'cluster',
    'block',
    'shmem_size',
    'is_cooperative',
)

17  

18  

cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster (blocks) ->
    block (threads). Each dimension in grid specifies clusters in the grid, each
    dimension in cluster specifies blocks per cluster, and each dimension in block
    specifies threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When cluster
        is not specified, this represents the number of blocks, otherwise
        this represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread block have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Defaults to 0)
    is_cooperative : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(
        self,
        grid: int | tuple[int, ...] | None = None,
        cluster: int | tuple[int, ...] | None = None,
        block: int | tuple[int, ...] | None = None,
        shmem_size: int | None = None,
        is_cooperative: bool = False,
    ) -> None:
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks or clusters if cluster is specified)
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster)
        block : Union[tuple, int], optional
            Block dimensions (threads per block)
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0)
        is_cooperative : bool, optional
            Whether to launch as cooperative kernel (default: False)

        Raises
        ------
        CUDAError
            If ``cluster`` is given and the device returned by ``Device()`` has
            compute capability below 9.0, or if ``is_cooperative`` is true and
            that device does not report cooperative-launch support.
        """
        # Convert and validate grid and block dimensions
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)

        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
        # look up the device from stream. We probably need to defer the checks related to
        # device compute capability or attributes.
        # thread block clusters are supported starting H100
        if cluster is not None:
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            self.cluster = None

        # Handle shmem_size default
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        self.is_cooperative = is_cooperative

        if self.is_cooperative and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")

    def _identity(self) -> tuple[Any, ...]:
        """Return the field values, in _LAUNCH_CONFIG_ATTRS order, used for equality and hashing."""
        return tuple(getattr(self, attr) for attr in _LAUNCH_CONFIG_ATTRS)

    def __repr__(self) -> str:
        """Return string representation of LaunchConfig."""
        parts = ', '.join(f'{attr}={getattr(self, attr)!r}' for attr in _LAUNCH_CONFIG_ATTRS)
        return f"LaunchConfig({parts})"

    def __eq__(self, other: object) -> bool:
        """Compare field-by-field with another LaunchConfig; defer for any other type."""
        if not isinstance(other, LaunchConfig):
            return NotImplemented
        return self._identity() == (<LaunchConfig>other)._identity()

    def __hash__(self) -> int:
        """Hash the same identity tuple that __eq__ compares."""
        return hash(self._identity())

    cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
        """Fill and return a C-level CUlaunchConfig describing this configuration.

        The returned struct's ``attrs`` pointer refers to storage owned by
        ``self._attrs``, which is cleared and refilled on every call. The result
        must therefore not be used after this object is destroyed or after a
        subsequent call to this method.
        """
        cdef cydriver.CUlaunchConfig drv_cfg
        cdef cydriver.CUlaunchAttribute attr
        # C stack locals start with indeterminate contents; zero both structs so
        # that padding and unused union bytes handed to the driver are well defined.
        memset(&drv_cfg, 0, sizeof(drv_cfg))
        memset(&attr, 0, sizeof(attr))
        self._attrs.resize(0)

        # Handle grid dimensions and cluster configuration
        if self.cluster is not None:
            # Convert grid from cluster units to block units
            drv_cfg.gridDimX = self.grid[0] * self.cluster[0]
            drv_cfg.gridDimY = self.grid[1] * self.cluster[1]
            drv_cfg.gridDimZ = self.grid[2] * self.cluster[2]

            # Set up cluster attribute
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster
            self._attrs.push_back(attr)
        else:
            drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid

        drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block
        drv_cfg.sharedMemBytes = self.shmem_size

        if self.is_cooperative:
            # Re-zero the scratch struct so no cluster-dimension bytes leak into
            # the cooperative attribute's value union.
            memset(&attr, 0, sizeof(attr))
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
            attr.value.cooperative = 1
            self._attrs.push_back(attr)

        drv_cfg.numAttrs = self._attrs.size()
        drv_cfg.attrs = self._attrs.data()

        return drv_cfg

154  

155  

156# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above 

cpdef object _to_native_launch_config(LaunchConfig config):
    """Build a Python-level ``driver.CUlaunchConfig`` from a LaunchConfig.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration. Its grid dimensions are
        expressed in blocks: when ``config.cluster`` is set, each grid
        dimension is multiplied by the matching cluster dimension.
    """
    cdef object native_cfg = driver.CUlaunchConfig()
    cdef list launch_attrs = []
    cdef object launch_attr
    cdef object cluster_dim
    cdef tuple blocks_per_axis

    if config.cluster is None:
        # Without clusters the grid is already counted in blocks.
        native_cfg.gridDimX, native_cfg.gridDimY, native_cfg.gridDimZ = config.grid
    else:
        # The grid is counted in clusters; scale it to blocks for the driver.
        blocks_per_axis = (
            config.grid[0] * config.cluster[0],
            config.grid[1] * config.cluster[1],
            config.grid[2] * config.cluster[2],
        )
        native_cfg.gridDimX, native_cfg.gridDimY, native_cfg.gridDimZ = blocks_per_axis

        # Describe the cluster shape through a launch attribute.
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        cluster_dim = launch_attr.value.clusterDim
        cluster_dim.x, cluster_dim.y, cluster_dim.z = config.cluster
        launch_attrs.append(launch_attr)

    native_cfg.blockDimX, native_cfg.blockDimY, native_cfg.blockDimZ = config.block
    native_cfg.sharedMemBytes = config.shmem_size

    if config.is_cooperative:
        # Cooperative launches are requested through their own attribute.
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        launch_attr.value.cooperative = 1
        launch_attrs.append(launch_attr)

    native_cfg.numAttrs = len(launch_attrs)
    native_cfg.attrs = launch_attrs

    return native_cfg