Coverage for cuda/core/experimental/_launch_config.pyx: 72% (60 statements)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cuda.core.experimental._device import Device
from cuda.core.experimental._utils.cuda_utils import (
    CUDAError,
    cast_to_3_tuple,
    driver,
    get_binding_version,
    handle_return,
)

# TODO: revisit this treatment for py313t builds
cdef bint _inited = False
cdef bint _use_ex = False


cdef void _lazy_init() except *:
    """Initialize module-level globals for driver version checks."""
    global _inited, _use_ex
    if _inited:
        return

    cdef tuple _py_major_minor
    cdef int _driver_ver

    # binding availability depends on cuda-python version
    _py_major_minor = get_binding_version()
    _driver_ver = handle_return(driver.cuDriverGetVersion())
    _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
    _inited = True
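
# A brief note on the version gate above: Python tuple comparison is
# lexicographic, e.g. (12, 0) >= (11, 8) is True, so any binding version at
# or above 11.8 enables _use_ex (and, with it, the thread block cluster
# support checked in LaunchConfig below).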


cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster
    (blocks) -> block (threads). Each dimension in grid specifies clusters in
    the grid, each dimension in cluster specifies blocks per cluster, and each
    dimension in block specifies threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When
        cluster is not specified, this represents the number of blocks;
        otherwise it represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread block have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes (defaults to 0).
    cooperative_launch : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(self, grid=None, cluster=None, block=None,
                 shmem_size=None, cooperative_launch=False):
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks, or number of clusters if
            cluster is specified).
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster).
        block : Union[tuple, int], optional
            Block dimensions (threads per block).
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0).
        cooperative_launch : bool, optional
            Whether to launch as a cooperative kernel (default: False).
        """
        _lazy_init()

        # Convert and validate grid and block dimensions
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)

        # FIXME: Calling Device() strictly speaking is not quite right; we should
        # instead look up the device from the stream. We probably need to defer the
        # checks related to device compute capability or attributes.
        # Thread block clusters are supported starting with H100 (compute capability 9.0).
        if cluster is not None:
            if not _use_ex:
                err, drvers = driver.cuDriverGetVersion()
                drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            self.cluster = None

        # Handle shmem_size default
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        # Handle cooperative_launch
        self.cooperative_launch = cooperative_launch

        # Validate cooperative launch support
        if self.cooperative_launch and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")

    def __repr__(self):
        """Return string representation of LaunchConfig."""
        return (f"LaunchConfig(grid={self.grid}, cluster={self.cluster}, "
                f"block={self.block}, shmem_size={self.shmem_size}, "
                f"cooperative_launch={self.cooperative_launch})")


cpdef object _to_native_launch_config(LaunchConfig config):
    """Convert LaunchConfig to native driver CUlaunchConfig.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration.

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration.
    """
    _lazy_init()

    cdef object drv_cfg = driver.CUlaunchConfig()
    cdef list attrs
    cdef object attr
    cdef object dim
    cdef tuple grid_blocks

    # Handle grid dimensions and cluster configuration
    if config.cluster is not None:
        # Convert grid from cluster units to block units
        grid_blocks = (
            config.grid[0] * config.cluster[0],
            config.grid[1] * config.cluster[1],
            config.grid[2] * config.cluster[2],
        )
        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = grid_blocks

        # Set up cluster attribute
        attr = driver.CUlaunchAttribute()
        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        dim = attr.value.clusterDim
        dim.x, dim.y, dim.z = config.cluster
        attrs = [attr]
    else:
        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
        attrs = []

    drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
    drv_cfg.sharedMemBytes = config.shmem_size

    if config.cooperative_launch:
        attr = driver.CUlaunchAttribute()
        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        attr.value.cooperative = 1
        attrs.append(attr)

    drv_cfg.numAttrs = len(attrs)
    drv_cfg.attrs = attrs

    return drv_cfg
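
# A minimal conversion sketch (hypothetical values). When a cluster is set,
# the native gridDim is emitted in block units:
#
#     cfg = LaunchConfig(grid=(4, 1, 1), cluster=(2, 1, 1), block=(128, 1, 1))
#     drv_cfg = _to_native_launch_config(cfg)
#     # drv_cfg.gridDimX == 8   (4 clusters x 2 blocks/cluster)
#     # drv_cfg.blockDimX == 128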