Coverage for cuda / core / experimental / _launcher.pyx: 92%
38 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-10 01:19 +0000
1# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
5from cuda.core.experimental._stream cimport Stream_accept
8from cuda.core.experimental._kernel_arg_handler import ParamHolder
9from cuda.core.experimental._module import Kernel
10from cuda.core.experimental._stream import Stream
11from cuda.core.experimental._utils.clear_error_support import assert_type
12from cuda.core.experimental._utils.cuda_utils import (
13 _reduce_3_tuple,
14 check_or_create_options,
15 driver,
16 get_binding_version,
17 handle_return,
18)
# TODO: revisit this treatment for py313t builds
# Module-level state populated once per process by _lazy_init().
_inited = False  # set to True after _lazy_init() has completed
_use_ex = None  # set by _lazy_init(): whether the cuLaunchKernelEx API is available
def _lazy_init():
    """Probe the driver/binding versions once and cache the result.

    Determines whether the richer ``cuLaunchKernelEx`` ("Ex") API can be
    used: it requires both CUDA driver >= 11.8 and cuda-python bindings
    >= 11.8.
    """
    global _inited, _use_ex
    if _inited:
        return
    # Binding availability depends on the installed cuda-python version.
    binding_ver = get_binding_version()
    driver_ver = handle_return(driver.cuDriverGetVersion())
    _use_ex = driver_ver >= 11080 and binding_ver >= (11, 8)
    _inited = True
def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
    """Launch a :obj:`~_module.Kernel` object with launch-time configuration.

    Parameters
    ----------
    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
        The stream establishing the stream ordering semantic of a
        launch.
    config : :obj:`LaunchConfig`
        Launch configurations inline with options provided by
        :obj:`~_launcher.LaunchConfig` dataclass.
    kernel : :obj:`~_module.Kernel`
        Kernel to launch.
    *kernel_args : Any
        Variable length argument list that is provided to the
        launching kernel.

    """
    stream = Stream_accept(stream, allow_stream_protocol=True)
    assert_type(kernel, Kernel)
    _lazy_init()
    config = check_or_create_options(LaunchConfig, config, "launch config")

    # TODO: can we ensure kernel_args is valid/safe to use here?
    # TODO: merge with HelperKernelParams?
    # Keep the holder alive through the launch call so the packed argument
    # buffer it owns stays valid.
    holder = ParamHolder(kernel_args)
    arg_ptr = holder.ptr

    # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care
    # about the CUfunction/CUkernel difference (which depends on whether the "old" or
    # "new" module loading APIs are in use). We check both binding & driver versions here
    # mainly to see if the "Ex" API is available and if so we use it, as it's more feature
    # rich.
    if not _use_ex:
        # TODO: check if config has any unsupported attrs
        handle_return(
            driver.cuLaunchKernel(
                int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, arg_ptr, 0
            )
        )
        return

    native_cfg = _to_native_launch_config(config)
    native_cfg.hStream = stream.handle
    if config.cooperative_launch:
        _check_cooperative_launch(kernel, config, stream)
    handle_return(driver.cuLaunchKernelEx(native_cfg, int(kernel._handle), arg_ptr, 0))
cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
    # A cooperative launch requires every block to be resident simultaneously,
    # so the total grid size may not exceed (blocks-per-SM * SM count) as
    # reported by the occupancy calculator for this kernel/config.
    device = stream.device
    sm_count = device.properties.multiprocessor_count
    block_size = _reduce_3_tuple(config.block)
    blocks_per_sm = kernel.occupancy.max_active_blocks_per_multiprocessor(block_size, config.shmem_size)
    max_grid_size = blocks_per_sm * sm_count
    if _reduce_3_tuple(config.grid) > max_grid_size:
        # For now let's try not to be smart and adjust the grid size behind users' back.
        # We explicitly ask users to adjust.
        x, y, z = config.grid
        raise ValueError(f"The specified grid size ({x} * {y} * {z}) exceeds the limit ({max_grid_size})")