Coverage for cuda/core/experimental/_launch_config.pyx: 72%

60 statements  


# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cuda.core.experimental._device import Device
from cuda.core.experimental._utils.cuda_utils import (
    CUDAError,
    cast_to_3_tuple,
    driver,
    get_binding_version,
    handle_return,
)

# TODO: revisit this treatment for py313t builds
cdef bint _inited = False
cdef bint _use_ex = False


cdef void _lazy_init() except *:
    """Initialize module-level globals for driver version checks."""
    global _inited, _use_ex
    if _inited:
        return

    cdef tuple _py_major_minor
    cdef int _driver_ver

    # binding availability depends on cuda-python version
    _py_major_minor = get_binding_version()
    _driver_ver = handle_return(driver.cuDriverGetVersion())
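    # cuDriverGetVersion() encodes the version as 1000 * major + 10 * minor;
    # 11080 therefore corresponds to CUDA 11.8.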

    _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
    _inited = True


cdef class LaunchConfig:
    """Customizable launch options.

    Note
    ----
    When cluster is specified, the grid parameter represents the number of
    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster
    (blocks) -> block (threads). Each dimension in grid specifies clusters
    in the grid, each dimension in cluster specifies blocks per cluster,
    and each dimension in block specifies threads per block.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function. When
        cluster is not specified, this represents the number of blocks;
        otherwise it represents the number of clusters.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) that will execute on the same
        GPU Processing Cluster (GPC). Blocks within a cluster have access to
        distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) that will execute on the same
        streaming multiprocessor (SM). Threads within a thread block have
        access to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes.
        (Defaults to 0.)
    cooperative_launch : bool, optional
        Whether this config can be used to launch a cooperative kernel.
    """
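
    # Worked illustration of the hierarchy above (hypothetical values): with
    # grid=(2, 2, 1), cluster=(2, 1, 1), and block=(128, 1, 1), a launch
    # consists of 2 * 2 * 1 = 4 clusters of 2 thread blocks each, i.e.
    # 8 blocks and 8 * 128 = 1024 threads in total.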

    # TODO: expand LaunchConfig to include other attributes
    # Note: attributes are declared in _launch_config.pxd

    def __init__(self, grid=None, cluster=None, block=None,
                 shmem_size=None, cooperative_launch=False):
        """Initialize LaunchConfig with validation.

        Parameters
        ----------
        grid : Union[tuple, int], optional
            Grid dimensions (number of blocks, or number of clusters if
            cluster is specified)
        cluster : Union[tuple, int], optional
            Cluster dimensions (Thread Block Cluster)
        block : Union[tuple, int], optional
            Block dimensions (threads per block)
        shmem_size : int, optional
            Dynamic shared memory size in bytes (default: 0)
        cooperative_launch : bool, optional
            Whether to launch as a cooperative kernel (default: False)
        """

        _lazy_init()

        # Convert and validate grid and block dimensions
        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", block)
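        # cast_to_3_tuple accepts an int or a tuple of up to three ints and
        # pads missing dimensions with 1 (e.g. grid=32 becomes (32, 1, 1)).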

        # FIXME: Calling Device() is, strictly speaking, not quite right; we
        # should instead look up the device from the stream. We probably need
        # to defer the checks related to device compute capability or attributes.
        # Thread block clusters are supported starting with H100 (compute capability 9.0).
        if cluster is not None:
            if not _use_ex:
                err, drvers = driver.cuDriverGetVersion()
                drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
        else:
            self.cluster = None

        # Handle shmem_size default
        if shmem_size is None:
            self.shmem_size = 0
        else:
            self.shmem_size = shmem_size

        # Handle cooperative_launch
        self.cooperative_launch = cooperative_launch

        # Validate cooperative launch support
        if self.cooperative_launch and not Device().properties.cooperative_launch:
            raise CUDAError("cooperative kernels are not supported on this device")
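        # Device support is necessary but not sufficient: a cooperative launch
        # additionally requires all blocks of the grid to be co-resident on
        # the device, which the driver verifies at launch time.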

    def __repr__(self):
        """Return string representation of LaunchConfig."""
        return (f"LaunchConfig(grid={self.grid}, cluster={self.cluster}, "
                f"block={self.block}, shmem_size={self.shmem_size}, "
                f"cooperative_launch={self.cooperative_launch})")
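

# Minimal construction sketch (hypothetical helper, not part of the original
# module; values are illustrative):
def _example_configs():
    # A plain 1-D launch: 32 blocks of 256 threads each, with 4 KiB of
    # dynamic shared memory per block.
    simple = LaunchConfig(grid=32, block=256, shmem_size=4096)
    # A cluster launch (compute capability 9.0+ only): grid now counts
    # clusters, so this is 16 clusters x 2 blocks = 32 thread blocks.
    clustered = LaunchConfig(grid=16, cluster=2, block=256)
    return simple, clustered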


cpdef object _to_native_launch_config(LaunchConfig config):
    """Convert LaunchConfig to native driver CUlaunchConfig.

    Parameters
    ----------
    config : LaunchConfig
        High-level launch configuration

    Returns
    -------
    driver.CUlaunchConfig
        Native CUDA driver launch configuration
    """

    _lazy_init()

    cdef object drv_cfg = driver.CUlaunchConfig()
    cdef list attrs
    cdef object attr
    cdef object dim
    cdef tuple grid_blocks

    # Handle grid dimensions and cluster configuration
    if config.cluster is not None:
        # Convert grid from cluster units to block units
        grid_blocks = (
            config.grid[0] * config.cluster[0],
            config.grid[1] * config.cluster[1],
            config.grid[2] * config.cluster[2],
        )
        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = grid_blocks
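        # For illustration: grid=(4, 1, 1) with cluster=(2, 1, 1) produces a
        # driver gridDim of (8, 1, 1), since the driver expects grid
        # dimensions in blocks rather than clusters.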

        # Set up cluster attribute
        attr = driver.CUlaunchAttribute()
        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        dim = attr.value.clusterDim
        dim.x, dim.y, dim.z = config.cluster
        attrs = [attr]
    else:
        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
        attrs = []

    drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
    drv_cfg.sharedMemBytes = config.shmem_size

    if config.cooperative_launch:
        attr = driver.CUlaunchAttribute()
        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
        attr.value.cooperative = 1
        attrs.append(attr)

    drv_cfg.numAttrs = len(attrs)
    drv_cfg.attrs = attrs

    return drv_cfg
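

# End-to-end sketch (hypothetical helper, not part of the original module):
# lowering a LaunchConfig to the native driver struct. The public cuda.core
# launch API normally performs this conversion internally; calling
# _to_native_launch_config directly is for demonstration only.
def _example_native_config():
    cfg = LaunchConfig(grid=32, block=256, shmem_size=4096)
    drv_cfg = _to_native_launch_config(cfg)
    # grid/block were normalized to 3-tuples, so the X dims carry the values.
    assert (drv_cfg.gridDimX, drv_cfg.blockDimX) == (32, 256)
    assert drv_cfg.sharedMemBytes == 4096
    return drv_cfg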