Coverage for cuda/core/_launch_config.pyx: 91.67%
72 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-13 01:38 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-13 01:38 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
5from libc.string cimport memset
7from typing import Any
9from cuda.core._device import Device
10from cuda.core._utils.cuda_utils import (
11 CUDAError,
12 cast_to_3_tuple,
13 driver,
14)
# Attribute names, in a fixed order, that LaunchConfig uses to build its
# identity tuple (equality / hashing) and its repr.
_LAUNCH_CONFIG_ATTRS = (
    "grid",
    "cluster",
    "block",
    "shmem_size",
    "is_cooperative",
)
cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster (blocks) ->
    block (threads). Each dimension in grid specifies clusters in the grid, each dimension in
    cluster specifies blocks per cluster, and each dimension in block specifies
    threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When cluster
        is not specified, this represents the number of blocks, otherwise
        this represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread block have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Default to size 0)
    is_cooperative : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(
        self,
        grid: int | tuple[int, ...] | None = None,
        cluster: int | tuple[int, ...] | None = None,
        block: int | tuple[int, ...] | None = None,
        shmem_size: int | None = None,
        is_cooperative: bool = False,
    ) -> None:
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks or clusters if cluster is specified)
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster)
        block : Union[tuple, int], optional
            Block dimensions (threads per block)
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0)
        is_cooperative : bool, optional
            Whether to launch as cooperative kernel (default: False)

        Raises
        ------
        CUDAError
            If ``cluster`` is given and ``Device().compute_capability`` is
            below (9, 0), or if ``is_cooperative`` is true and
            ``Device().properties.cooperative_launch`` is falsy.
        """
        # Convert and validate grid and block dimensions
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)

        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
        # look up the device from stream. We probably need to defer the checks related to
        # device compute capability or attributes.
        # thread block clusters are supported starting H100
        if cluster is not None:
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            self.cluster = None

        # Handle shmem_size default
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        self.is_cooperative = is_cooperative

        if self.is_cooperative and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")

    def _identity(self) -> tuple[Any, ...]:
        """Return the attribute values named in ``_LAUNCH_CONFIG_ATTRS`` as a tuple.

        This tuple is what ``__eq__`` compares and ``__hash__`` hashes.
        """
        return tuple(getattr(self, attr) for attr in _LAUNCH_CONFIG_ATTRS)

    def __repr__(self) -> str:
        """Return string representation of LaunchConfig."""
        parts = ', '.join(f'{attr}={getattr(self, attr)!r}' for attr in _LAUNCH_CONFIG_ATTRS)
        return f"LaunchConfig({parts})"

    def __eq__(self, other: object) -> bool:
        """Compare identity tuples; defer to ``other`` if it is not a LaunchConfig."""
        if not isinstance(other, LaunchConfig):
            return NotImplemented
        return self._identity() == (<LaunchConfig>other)._identity()

    def __hash__(self) -> int:
        """Hash the identity tuple so that equal configs hash equally."""
        return hash(self._identity())

    cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
        """Fill and return a C-level ``CUlaunchConfig`` for this configuration.

        The launch attributes are stored in ``self._attrs`` and the returned
        struct's ``attrs`` field points at that storage.
        NOTE(review): the pointer is therefore presumably only valid while this
        object is alive and until the next call (which resizes ``self._attrs``)
        -- confirm against callers.
        """
        cdef cydriver.CUlaunchConfig drv_cfg
        cdef cydriver.CUlaunchAttribute attr
        memset(&drv_cfg, 0, sizeof(drv_cfg))
        # attr is a plain C stack struct: zero it so that padding and unused
        # union bytes copied into self._attrs are deterministic.
        memset(&attr, 0, sizeof(attr))
        self._attrs.resize(0)

        # Handle grid dimensions and cluster configuration
        if self.cluster is not None:
            # Convert grid from cluster units to block units
            drv_cfg.gridDimX = self.grid[0] * self.cluster[0]
            drv_cfg.gridDimY = self.grid[1] * self.cluster[1]
            drv_cfg.gridDimZ = self.grid[2] * self.cluster[2]

            # Set up cluster attribute
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster
            self._attrs.push_back(attr)
        else:
            drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid

        drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block
        drv_cfg.sharedMemBytes = self.shmem_size

        if self.is_cooperative:
            # attr is reused: clear any cluster-dimension bytes left in the
            # value union before writing the cooperative flag.
            memset(&attr, 0, sizeof(attr))
            attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
            attr.value.cooperative = 1
            self._attrs.push_back(attr)

        drv_cfg.numAttrs = self._attrs.size()
        drv_cfg.attrs = self._attrs.data()

        return drv_cfg
156# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above
cpdef object _to_native_launch_config(LaunchConfig config):
    """Translate a LaunchConfig into a Python-level ``driver.CUlaunchConfig``.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration to translate.

    Returns
    -------
    driver.CUlaunchConfig
        Driver launch configuration filled in from ``config``. When
        ``config.cluster`` is set, the grid dimensions are multiplied by the
        cluster dimensions and a cluster-dimension attribute is attached.
    """
    cdef object native_cfg = driver.CUlaunchConfig()
    cdef list launch_attrs = []
    cdef object launch_attr
    cdef object cluster_dim
    cdef tuple blocks_in_grid

    if config.cluster is None:
        # No clusters requested: the grid is passed through unchanged.
        native_cfg.gridDimX, native_cfg.gridDimY, native_cfg.gridDimZ = config.grid
    else:
        # The grid is expressed in clusters; scale it to blocks.
        blocks_in_grid = (
            config.grid[0] * config.cluster[0],
            config.grid[1] * config.cluster[1],
            config.grid[2] * config.cluster[2],
        )
        native_cfg.gridDimX, native_cfg.gridDimY, native_cfg.gridDimZ = blocks_in_grid

        # Describe the cluster shape as a launch attribute.
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        cluster_dim = launch_attr.value.clusterDim
        cluster_dim.x, cluster_dim.y, cluster_dim.z = config.cluster
        launch_attrs.append(launch_attr)

    native_cfg.blockDimX, native_cfg.blockDimY, native_cfg.blockDimZ = config.block
    native_cfg.sharedMemBytes = config.shmem_size

    if config.is_cooperative:
        # Cooperative launches are requested through their own attribute.
        launch_attr = driver.CUlaunchAttribute()
        launch_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        launch_attr.value.cooperative = 1
        launch_attrs.append(launch_attr)

    native_cfg.numAttrs = len(launch_attrs)
    native_cfg.attrs = launch_attrs

    return native_cfg