# NOTE(review): this file was recovered from a coverage.py HTML export
# (cuda/core/_launch_config.pyx, 71 statements); the report navigation
# header and per-line hit markers have been stripped.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from libc.string cimport memset

from cuda.core._device import Device
from cuda.core._utils.cuda_utils import (
    CUDAError,
    cast_to_3_tuple,
    driver,
)
# Attribute names that define a LaunchConfig's identity; used by
# _identity(), __repr__, __eq__ and __hash__ below.
_LAUNCH_CONFIG_ATTRS = ('grid', 'cluster', 'block', 'shmem_size', 'cooperative_launch')
cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster (blocks) ->
    block (threads). Each dimension in grid specifies clusters in the grid, each dimension in
    cluster specifies blocks per cluster, and each dimension in block specifies
    threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When cluster
        is not specified, this represents the number of blocks, otherwise
        this represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread blocks have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Default to size 0)
    cooperative_launch : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(self, grid=None, cluster=None, block=None,
                 shmem_size=None, cooperative_launch=False):
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks or clusters if cluster is specified)
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster)
        block : Union[tuple, int], optional
            Block dimensions (threads per block)
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0)
        cooperative_launch : bool, optional
            Whether to launch as cooperative kernel (default: False)

        Raises
        ------
        CUDAError
            If a cluster is requested on a device with compute capability
            below 9.0, or if cooperative_launch is requested on a device
            that does not support it.
        """
        # Convert and validate grid and block dimensions
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)

        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
        # look up the device from stream. We probably need to defer the checks related to
        # device compute capability or attributes.
        # thread block clusters are supported starting H100
        if cluster is not None:
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            self.cluster = None

        # Handle shmem_size default
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        # Handle cooperative_launch
        self.cooperative_launch = cooperative_launch

        # Validate cooperative launch support
        if self.cooperative_launch and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")

    def _identity(self):
        # Single source of truth for value identity; keeps __eq__ and
        # __hash__ consistent with each other by construction.
        return tuple(getattr(self, attr) for attr in _LAUNCH_CONFIG_ATTRS)

    def __repr__(self):
        """Return string representation of LaunchConfig."""
        parts = ', '.join(f'{attr}={getattr(self, attr)!r}' for attr in _LAUNCH_CONFIG_ATTRS)
        return f"LaunchConfig({parts})"

    def __eq__(self, other) -> bool:
        if not isinstance(other, LaunchConfig):
            return NotImplemented
        return self._identity() == (<LaunchConfig>other)._identity()

    def __hash__(self) -> int:
        return hash(self._identity())

    cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
        # Build a zero-initialized native CUlaunchConfig from this object.
        # Launch attributes are stored in self._attrs (a C++ vector declared
        # in the .pxd) so that drv_cfg.attrs can point at stable storage that
        # outlives this call.
        cdef cydriver.CUlaunchConfig drv_cfg
        cdef cydriver.CUlaunchAttribute attr
        memset(&drv_cfg, 0, sizeof(drv_cfg))
        self._attrs.resize(0)

        # Handle grid dimensions and cluster configuration
        if self.cluster is not None:
            # Convert grid from cluster units to block units
            drv_cfg.gridDimX = self.grid[0] * self.cluster[0]
            drv_cfg.gridDimY = self.grid[1] * self.cluster[1]
            drv_cfg.gridDimZ = self.grid[2] * self.cluster[2]

            # Set up cluster attribute
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster
            self._attrs.push_back(attr)
        else:
            drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid

        drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block
        drv_cfg.sharedMemBytes = self.shmem_size

        if self.cooperative_launch:
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
            attr.value.cooperative = 1
            self._attrs.push_back(attr)

        drv_cfg.numAttrs = self._attrs.size()
        drv_cfg.attrs = self._attrs.data()

        return drv_cfg
# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above
cpdef object _to_native_launch_config(LaunchConfig config):
    """Convert LaunchConfig to native driver CUlaunchConfig.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration
    """
    cdef object drv_cfg = driver.CUlaunchConfig()
    cdef list attrs
    cdef object attr
    cdef object dim
    cdef tuple grid_blocks

    # Handle grid dimensions and cluster configuration
    if config.cluster is not None:
        # Convert grid from cluster units to block units
        grid_blocks = (
            config.grid[0] * config.cluster[0],
            config.grid[1] * config.cluster[1],
            config.grid[2] * config.cluster[2],
        )
        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = grid_blocks

        # Set up cluster attribute
        attr = driver.CUlaunchAttribute()
        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        dim = attr.value.clusterDim
        dim.x, dim.y, dim.z = config.cluster
        attrs = [attr]
    else:
        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
        attrs = []

    drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
    drv_cfg.sharedMemBytes = config.shmem_size

    if config.cooperative_launch:
        attr = driver.CUlaunchAttribute()
        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        attr.value.cooperative = 1
        attrs.append(attr)

    drv_cfg.numAttrs = len(attrs)
    drv_cfg.attrs = attrs

    return drv_cfg