Coverage for cuda/core/_launch_config.pyx: 62.37% (93 statements)
« prev ^ index » next — coverage.py v7.13.4, created at 2026-03-08 01:07 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
5from libc.string cimport memset
7from cuda.core._utils.cuda_utils cimport (
8 HANDLE_RETURN,
9)
11import threading
13from cuda.core._device import Device
14from cuda.core._utils.cuda_utils import (
15 CUDAError,
16 cast_to_3_tuple,
17 driver,
18 get_binding_version,
19)
# Module-level one-time-init state, guarded by _lock.
cdef bint _inited = False   # set once _lazy_init() has completed
cdef bint _use_ex = False   # True when driver and cuda-python binding are both >= 11.8
cdef object _lock = threading.Lock()

# Attribute names for identity comparison and representation
_LAUNCH_CONFIG_ATTRS = ('grid', 'cluster', 'block', 'shmem_size', 'cooperative_launch')
cdef int _lazy_init() except?-1:
    """Probe driver/binding versions once, thread-safely.

    Sets the module-level ``_use_ex`` flag when both the CUDA driver and
    the cuda-python binding are at least 11.8 (presumably gating the
    *Ex-style launch path -- confirm against the launch call sites).
    Returns 0 on success; the ``except?-1`` clause lets HANDLE_RETURN
    propagate CUDA errors as exceptions.
    """
    global _inited, _use_ex
    # Fast path: already initialized, no lock needed.
    if _inited:
        return 0

    cdef tuple binding_ver
    cdef int drv_ver
    with _lock:
        # Double-checked locking: another thread may have finished first.
        if _inited:
            return 0

        # binding availability depends on cuda-python version
        binding_ver = get_binding_version()
        HANDLE_RETURN(cydriver.cuDriverGetVersion(&drv_ver))
        _use_ex = (drv_ver >= 11080) and (binding_ver >= (11, 8))
        _inited = True

    return 0
cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster
    (blocks) -> block (threads). Each dimension in grid specifies clusters
    in the grid, each dimension in cluster specifies blocks per cluster,
    and each dimension in block specifies threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When
        cluster is not specified, this represents the number of blocks,
        otherwise this represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the
        same GPU Processing Cluster (GPC). Blocks within a cluster have
        access to distributed shared memory and can be explicitly
        synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread block have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Default to size 0)
    cooperative_launch : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(self, grid=None, cluster=None, block=None,
                 shmem_size=None, cooperative_launch=False):
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks, or clusters if cluster is
            specified).
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster).
        block : Union[tuple, int], optional
            Block dimensions (threads per block).
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0).
        cooperative_launch : bool, optional
            Whether to launch as cooperative kernel (default: False).
        """
        _lazy_init()

        # Normalize grid and block to validated 3-tuples up front.
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)

        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
        # look up the device from stream. We probably need to defer the checks related to
        # device compute capability or attributes.
        # thread block clusters are supported starting H100
        if cluster is None:
            self.cluster = None
        else:
            if not _use_ex:
                err, ver = driver.cuDriverGetVersion()
                drvers_fmt = f" (got driver version {ver})" if err == driver.CUresult.CUDA_SUCCESS else ""
                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)

        # shmem_size defaults to 0 bytes of dynamic shared memory.
        self.shmem_size = 0 if shmem_size is None else shmem_size

        self.cooperative_launch = cooperative_launch
        # Cooperative launch requires device support.
        if self.cooperative_launch and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")

    def _identity(self):
        # Tuple of comparable attributes, in _LAUNCH_CONFIG_ATTRS order;
        # shared by __eq__ and __hash__.
        return tuple(getattr(self, name) for name in _LAUNCH_CONFIG_ATTRS)

    def __repr__(self):
        """Return string representation of LaunchConfig."""
        fields = (f'{name}={getattr(self, name)!r}' for name in _LAUNCH_CONFIG_ATTRS)
        return f"LaunchConfig({', '.join(fields)})"

    def __eq__(self, other) -> bool:
        if isinstance(other, LaunchConfig):
            return self._identity() == (<LaunchConfig>other)._identity()
        # Defer to the other operand's __eq__ for non-LaunchConfig types.
        return NotImplemented

    def __hash__(self) -> int:
        return hash(self._identity())

    cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
        """Build the cydriver.CUlaunchConfig equivalent of this config.

        NOTE(review): the returned struct's ``attrs`` pointer references
        ``self._attrs`` storage, so it is only valid while this object
        (and its attribute vector) stays alive -- confirm callers hold a
        reference for the launch duration.
        """
        _lazy_init()
        cdef cydriver.CUlaunchConfig cfg
        cdef cydriver.CUlaunchAttribute launch_attr
        memset(&cfg, 0, sizeof(cfg))
        # Reuse the attribute vector; clear any leftovers from prior calls.
        self._attrs.resize(0)

        # Grid dimensions and, if present, the cluster attribute.
        if self.cluster is None:
            cfg.gridDimX, cfg.gridDimY, cfg.gridDimZ = self.grid
        else:
            # self.grid counts clusters; the driver expects blocks.
            cfg.gridDimX = self.grid[0] * self.cluster[0]
            cfg.gridDimY = self.grid[1] * self.cluster[1]
            cfg.gridDimZ = self.grid[2] * self.cluster[2]

            launch_attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            launch_attr.value.clusterDim.x, launch_attr.value.clusterDim.y, launch_attr.value.clusterDim.z = self.cluster
            self._attrs.push_back(launch_attr)

        cfg.blockDimX, cfg.blockDimY, cfg.blockDimZ = self.block
        cfg.sharedMemBytes = self.shmem_size

        if self.cooperative_launch:
            launch_attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
            launch_attr.value.cooperative = 1
            self._attrs.push_back(launch_attr)

        cfg.numAttrs = self._attrs.size()
        cfg.attrs = self._attrs.data()

        return cfg
# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above
cpdef object _to_native_launch_config(LaunchConfig config):
    """Convert LaunchConfig to native driver CUlaunchConfig.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration
    """
    _lazy_init()

    cdef object native = driver.CUlaunchConfig()
    cdef list attr_list = []
    cdef object launch_attr
    cdef object cluster_dim

    # Grid dimensions and, if present, the cluster attribute.
    if config.cluster is None:
        native.gridDimX, native.gridDimY, native.gridDimZ = config.grid
    else:
        # config.grid counts clusters; the driver expects blocks.
        native.gridDimX = config.grid[0] * config.cluster[0]
        native.gridDimY = config.grid[1] * config.cluster[1]
        native.gridDimZ = config.grid[2] * config.cluster[2]

        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        cluster_dim = launch_attr.value.clusterDim
        cluster_dim.x, cluster_dim.y, cluster_dim.z = config.cluster
        attr_list.append(launch_attr)

    native.blockDimX, native.blockDimY, native.blockDimZ = config.block
    native.sharedMemBytes = config.shmem_size

    if config.cooperative_launch:
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        launch_attr.value.cooperative = 1
        attr_list.append(launch_attr)

    native.numAttrs = len(attr_list)
    native.attrs = attr_list

    return native