Coverage for cuda / core / experimental / _launcher.pyx: 92%

38 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-10 01:19 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config 

5from cuda.core.experimental._stream cimport Stream_accept 

6  

7  

8from cuda.core.experimental._kernel_arg_handler import ParamHolder 

9from cuda.core.experimental._module import Kernel 

10from cuda.core.experimental._stream import Stream 

11from cuda.core.experimental._utils.clear_error_support import assert_type 

12from cuda.core.experimental._utils.cuda_utils import ( 

13 _reduce_3_tuple, 

14 check_or_create_options, 

15 driver, 

16 get_binding_version, 

17 handle_return, 

18) 

19  

# TODO: revisit this treatment for py313t builds
# One-time-initialization cache shared with _lazy_init():
#   _inited — True once the version probe below has run (guards re-entry)
#   _use_ex — True/False/None; whether driver.cuLaunchKernelEx may be used
#             (set by _lazy_init from the driver and binding versions)
_inited = False
_use_ex = None

23  

24  

def _lazy_init():
    """Probe driver/binding versions once and cache whether the ``Ex`` launch API is usable."""
    global _inited, _use_ex
    if _inited:
        return
    # cuLaunchKernelEx requires both a CUDA 11.8+ driver and a cuda-python
    # binding built against 11.8+; either one alone is not enough.
    binding_ver = get_binding_version()
    driver_ver = handle_return(driver.cuDriverGetVersion())
    _use_ex = (driver_ver >= 11080) and (binding_ver >= (11, 8))
    _inited = True

36  

37  

def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
    """Launch a :obj:`~_module.Kernel` with the given launch-time configuration.

    Parameters
    ----------
    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
        The stream establishing the stream ordering semantic of a
        launch.
    config : :obj:`LaunchConfig`
        Launch configurations inline with options provided by
        :obj:`~_launcher.LaunchConfig` dataclass.
    kernel : :obj:`~_module.Kernel`
        Kernel to launch.
    *kernel_args : Any
        Variable length argument list that is provided to the
        launching kernel.

    """
    stream = Stream_accept(stream, allow_stream_protocol=True)
    assert_type(kernel, Kernel)
    _lazy_init()
    config = check_or_create_options(LaunchConfig, config, "launch config")

    # Marshal the Python-level arguments into a C-compatible pointer array.
    # The holder must stay referenced until the launch call returns.
    # TODO: can we ensure kernel_args is valid/safe to use here?
    # TODO: merge with HelperKernelParams?
    holder = ParamHolder(kernel_args)
    arg_ptr = holder.ptr

    # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care
    # about the CUfunction/CUkernel difference (which depends on whether the "old" or
    # "new" module loading APIs are in use). _use_ex reflects both the binding and the
    # driver version, mainly so we can pick the more feature-rich "Ex" API when present.
    if not _use_ex:
        # Legacy entry point for old drivers/bindings.
        # TODO: check if config has any unsupported attrs
        gx, gy, gz = config.grid
        bx, by, bz = config.block
        handle_return(
            driver.cuLaunchKernel(
                int(kernel._handle), gx, gy, gz, bx, by, bz, config.shmem_size, stream.handle, arg_ptr, 0
            )
        )
        return

    native_cfg = _to_native_launch_config(config)
    native_cfg.hStream = stream.handle
    if config.cooperative_launch:
        _check_cooperative_launch(kernel, config, stream)
    handle_return(driver.cuLaunchKernelEx(native_cfg, int(kernel._handle), arg_ptr, 0))

85  

86  

cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
    """Raise ``ValueError`` if ``config.grid`` exceeds the cooperative-launch limit.

    Cooperative launches require every block to be resident simultaneously, so
    the grid may not exceed (max active blocks per SM) * (number of SMs).
    """
    device = stream.device
    blocks_per_sm = kernel.occupancy.max_active_blocks_per_multiprocessor(
        _reduce_3_tuple(config.block), config.shmem_size
    )
    max_grid_size = blocks_per_sm * device.properties.multiprocessor_count
    if _reduce_3_tuple(config.grid) > max_grid_size:
        # For now let's try not to be smart and adjust the grid size behind users' back.
        # We explicitly ask users to adjust.
        x, y, z = config.grid
        raise ValueError(f"The specified grid size ({x} * {y} * {z}) exceeds the limit ({max_grid_size})")