Coverage for cuda / core / _launch_config.pyx: 56.34%
71 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-22 01:37 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-22 01:37 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
5from libc.string cimport memset
7from cuda.core._device import Device
8from cuda.core._utils.cuda_utils import (
9 CUDAError,
10 cast_to_3_tuple,
11 driver,
12)
# Names of the LaunchConfig fields that define its identity; LaunchConfig
# walks this tuple for equality, hashing and repr, so the order is significant.
_LAUNCH_CONFIG_ATTRS = (
    'grid',
    'cluster',
    'block',
    'shmem_size',
    'is_cooperative',
)
cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster (blocks) ->
    block (threads). Each dimension in grid specifies clusters in the grid, each
    dimension in cluster specifies blocks per cluster, and each dimension in block
    specifies threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When cluster
        is not specified, this represents the number of blocks, otherwise
        this represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread block have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Defaults to 0)
    is_cooperative : bool, optional
        Whether this config can be used to launch a cooperative kernel.
        (Defaults to False)
    """

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(self, grid=None, cluster=None, block=None,
                 shmem_size=None, is_cooperative=False):
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks or clusters if cluster is specified)
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster)
        block : Union[tuple, int], optional
            Block dimensions (threads per block)
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0)
        is_cooperative : bool, optional
            Whether to launch as cooperative kernel (default: False)

        Raises
        ------
        CUDAError
            If ``cluster`` is given and the device returned by ``Device()``
            reports a compute capability below (9, 0), or if
            ``is_cooperative`` is true and that device does not report
            ``cooperative_launch`` support.

        Notes
        -----
        ``grid``, ``block`` and ``cluster`` are passed through
        ``cast_to_3_tuple``; any error it raises for malformed dimensions
        propagates unchanged (NOTE(review): exact exception type is defined
        in ``cuda_utils`` -- confirm there).
        """
        # Convert and validate grid and block dimensions
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)

        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
        # look up the device from stream. We probably need to defer the checks related to
        # device compute capability or attributes.
        # thread block clusters are supported starting H100
        if cluster is not None:
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            # Only normalize the cluster shape once the device check has passed.
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            self.cluster = None

        # Handle shmem_size default
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        self.is_cooperative = is_cooperative

        if self.is_cooperative and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")

    def _identity(self):
        """Return the field values, in ``_LAUNCH_CONFIG_ATTRS`` order, as a tuple.

        This tuple is the single source of truth for ``__eq__`` and ``__hash__``.
        """
        return tuple(getattr(self, attr) for attr in _LAUNCH_CONFIG_ATTRS)

    def __repr__(self):
        """Return string representation of LaunchConfig."""
        parts = ', '.join(f'{attr}={getattr(self, attr)!r}' for attr in _LAUNCH_CONFIG_ATTRS)
        return f"LaunchConfig({parts})"

    def __eq__(self, other) -> bool:
        """Compare field-by-field with another LaunchConfig.

        Returns ``NotImplemented`` for any other type so Python can try the
        reflected comparison.
        """
        if not isinstance(other, LaunchConfig):
            return NotImplemented
        return self._identity() == (<LaunchConfig>other)._identity()

    def __hash__(self) -> int:
        """Hash the same tuple that ``__eq__`` compares, keeping the two consistent."""
        return hash(self._identity())

    cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
        # Build a C-level CUlaunchConfig from this object's fields.
        #
        # The returned struct's ``attrs`` pointer refers to storage owned by
        # ``self._attrs`` (see the ``.data()`` call below), and that storage is
        # rebuilt on every call. NOTE(review): ``_attrs`` is declared in the
        # .pxd and presumably a C++ vector; if so, the pointer is only valid
        # while this object is alive and until the next call -- confirm.
        cdef cydriver.CUlaunchConfig drv_cfg
        cdef cydriver.CUlaunchAttribute attr
        memset(&drv_cfg, 0, sizeof(drv_cfg))
        self._attrs.resize(0)

        # Handle grid dimensions and cluster configuration
        if self.cluster is not None:
            # Convert grid from cluster units to block units
            drv_cfg.gridDimX = self.grid[0] * self.cluster[0]
            drv_cfg.gridDimY = self.grid[1] * self.cluster[1]
            drv_cfg.gridDimZ = self.grid[2] * self.cluster[2]

            # Set up cluster attribute. Zero the struct first so that bytes
            # outside the fields written here are not leftover stack contents.
            memset(&attr, 0, sizeof(attr))
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster
            self._attrs.push_back(attr)
        else:
            drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid

        drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block
        drv_cfg.sharedMemBytes = self.shmem_size

        if self.is_cooperative:
            # ``attr`` may still hold the cluster attribute written above;
            # clear it so no stale bytes ride along with this attribute.
            memset(&attr, 0, sizeof(attr))
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
            attr.value.cooperative = 1
            self._attrs.push_back(attr)

        drv_cfg.numAttrs = self._attrs.size()
        drv_cfg.attrs = self._attrs.data()

        return drv_cfg
148# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above
cpdef object _to_native_launch_config(LaunchConfig config):
    """Build a Python-level ``driver.CUlaunchConfig`` from a LaunchConfig.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration. When ``config.cluster`` is
        set, the grid dimensions are the element-wise product of
        ``config.grid`` and ``config.cluster`` and a cluster-dimension launch
        attribute is attached; when ``config.is_cooperative`` is true, a
        cooperative launch attribute is attached as well.
    """
    cdef object native_cfg = driver.CUlaunchConfig()
    cdef list launch_attrs = []
    cdef object launch_attr
    cdef object cluster_dim
    cdef tuple blocks_in_grid

    if config.cluster is None:
        # No clusters: the grid is already expressed in blocks.
        native_cfg.gridDimX, native_cfg.gridDimY, native_cfg.gridDimZ = config.grid
    else:
        # The grid is expressed in clusters; the driver wants it in blocks.
        blocks_in_grid = (
            config.grid[0] * config.cluster[0],
            config.grid[1] * config.cluster[1],
            config.grid[2] * config.cluster[2],
        )
        native_cfg.gridDimX, native_cfg.gridDimY, native_cfg.gridDimZ = blocks_in_grid

        # Describe the cluster shape through a launch attribute.
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        cluster_dim = launch_attr.value.clusterDim
        cluster_dim.x, cluster_dim.y, cluster_dim.z = config.cluster
        launch_attrs.append(launch_attr)

    native_cfg.blockDimX, native_cfg.blockDimY, native_cfg.blockDimZ = config.block
    native_cfg.sharedMemBytes = config.shmem_size

    if config.is_cooperative:
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        launch_attr.value.cooperative = 1
        launch_attrs.append(launch_attr)

    native_cfg.numAttrs = len(launch_attrs)
    native_cfg.attrs = launch_attrs

    return native_cfg