Coverage for cuda/core/experimental/_launch_config.pyx: 72%

60 statements  


# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cuda.core.experimental._device import Device
from cuda.core.experimental._utils.cuda_utils import (
    CUDAError,
    cast_to_3_tuple,
    driver,
    get_binding_version,
    handle_return,
)

# TODO: revisit this treatment for py313t builds
cdef bint _inited = False
cdef bint _use_ex = False


cdef void _lazy_init() except *:
    """Initialize module-level globals for driver version checks."""
    global _inited, _use_ex
    if _inited:
        return

    cdef tuple _py_major_minor
    cdef int _driver_ver

    # binding availability depends on cuda-python version
    _py_major_minor = get_binding_version()
    _driver_ver = handle_return(driver.cuDriverGetVersion())
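    # cuDriverGetVersion() encodes the version as 1000 * major + 10 * minor;
    # 11080 therefore corresponds to CUDA 11.8.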

    _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
    _inited = True


cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster
    (blocks) -> block (threads). Each dimension in grid specifies clusters
    in the grid, each dimension in cluster specifies blocks per cluster,
    and each dimension in block specifies threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When
        cluster is not specified, this represents the number of blocks;
        otherwise it represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread block have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Defaults to 0.)
    cooperative_launch : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """
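
    # Worked illustration of the hierarchy above (hypothetical values): with
    # grid=(2, 2, 1), cluster=(2, 1, 1), and block=(128, 1, 1), a launch
    # consists of 2 * 2 * 1 = 4 clusters of 2 thread blocks each, i.e.
    # 8 blocks and 8 * 128 = 1024 threads in total.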

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(self, grid=None, cluster=None, block=None,
                 shmem_size=None, cooperative_launch=False):
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks, or number of clusters if
            cluster is specified)
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster)
        block : Union[tuple, int], optional
            Block dimensions (threads per block)
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0)
        cooperative_launch : bool, optional
            Whether to launch as a cooperative kernel (default: False)
        """

        _lazy_init()

        # Convert and validate grid and block dimensions
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)
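        # cast_to_3_tuple accepts an int or a tuple of up to three ints and
        # pads missing dimensions with 1 (e.g. grid=32 becomes (32, 1, 1)).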

        # FIXME: Calling Device() is, strictly speaking, not quite right; we
        # should instead look up the device from the stream. We probably need
        # to defer the checks related to device compute capability or attributes.
        # Thread block clusters are supported starting with H100 (compute capability 9.0).
        if cluster is not None:
            if not _use_ex:
                err, drvers = driver.cuDriverGetVersion()
                drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            self.cluster = None

        # Handle shmem_size default
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        # Handle cooperative_launch
        self.cooperative_launch = cooperative_launch

        # Validate cooperative launch support
        if self.cooperative_launch and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")
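        # Device support is necessary but not sufficient: a cooperative launch
        # additionally requires all blocks of the grid to be co-resident on
        # the device, which the driver verifies at launch time.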

    def __repr__(self):
        """Return string representation of LaunchConfig."""
        return (f"LaunchConfig(grid={self.grid}, cluster={self.cluster}, "
                f"block={self.block}, shmem_size={self.shmem_size}, "
                f"cooperative_launch={self.cooperative_launch})")
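

# Minimal construction sketch (hypothetical helper, not part of the original
# module; values are illustrative):
def _example_configs():
    # A plain 1-D launch: 32 blocks of 256 threads each, with 4 KiB of
    # dynamic shared memory per block.
    simple = LaunchConfig(grid=32, block=256, shmem_size=4096)
    # A cluster launch (compute capability 9.0+ only): grid now counts
    # clusters, so this is 16 clusters x 2 blocks = 32 thread blocks.
    clustered = LaunchConfig(grid=16, cluster=2, block=256)
    return simple, clustered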


cpdef object _to_native_launch_config(LaunchConfig config):
    """Convert LaunchConfig to native driver CUlaunchConfig.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration
    """

    _lazy_init()

    cdef object drv_cfg = driver.CUlaunchConfig()
    cdef list attrs
    cdef object attr
    cdef object dim
    cdef tuple grid_blocks

    # Handle grid dimensions and cluster configuration
    if config.cluster is not None:
        # Convert grid from cluster units to block units
        grid_blocks = (
            config.grid[0] * config.cluster[0],
            config.grid[1] * config.cluster[1],
            config.grid[2] * config.cluster[2],
        )
        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = grid_blocks
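        # For illustration: grid=(4, 1, 1) with cluster=(2, 1, 1) produces a
        # driver gridDim of (8, 1, 1), since the driver expects grid
        # dimensions in blocks rather than clusters.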

        # Set up cluster attribute
        attr = driver.CUlaunchAttribute()
        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        dim = attr.value.clusterDim
        dim.x, dim.y, dim.z = config.cluster
        attrs = [attr]
    else:
        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
        attrs = []

    drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
    drv_cfg.sharedMemBytes = config.shmem_size

    if config.cooperative_launch:
        attr = driver.CUlaunchAttribute()
        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        attr.value.cooperative = 1
        attrs.append(attr)

    drv_cfg.numAttrs = len(attrs)
    drv_cfg.attrs = attrs

    return drv_cfg
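

# End-to-end sketch (hypothetical helper, not part of the original module):
# lowering a LaunchConfig to the native driver struct. The public cuda.core
# launch API normally performs this conversion internally; calling
# _to_native_launch_config directly is for demonstration only.
def _example_native_config():
    cfg = LaunchConfig(grid=32, block=256, shmem_size=4096)
    drv_cfg = _to_native_launch_config(cfg)
    # grid/block were normalized to 3-tuples, so the X dims carry the values.
    assert (drv_cfg.gridDimX, drv_cfg.blockDimX) == (32, 256)
    assert drv_cfg.sharedMemBytes == 4096
    return drv_cfg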