Coverage for cuda / core / _launcher.pyx: 88.89%

45 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-08 01:07 +0000

1# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 

2# 

3# SPDX-License-Identifier: Apache-2.0 

4  

5from libc.stdint cimport uintptr_t 

6  

7from cuda.bindings cimport cydriver 

8  

9from cuda.core._launch_config cimport LaunchConfig 

10from cuda.core._kernel_arg_handler cimport ParamHolder 

11from cuda.core._module cimport Kernel 

12from cuda.core._resource_handles cimport as_cu 

13from cuda.core._stream cimport Stream_accept, Stream 

14from cuda.core._utils.cuda_utils cimport ( 

15 check_or_create_options, 

16 HANDLE_RETURN, 

17) 

18  

19import threading 

20  

21from cuda.core._module import Kernel 

22from cuda.core._stream import Stream 

23from cuda.core._utils.cuda_utils import ( 

24 _reduce_3_tuple, 

25 get_binding_version, 

26) 

27  

28  

# One-time initialization state for this module, guarded by _lock below.
cdef bint _inited = False
# True when the cuLaunchKernelEx path is available; decided once in
# _lazy_init() from both the driver version and the binding version.
cdef bint _use_ex = False
# Serializes the one-time version probing in _lazy_init().
cdef object _lock = threading.Lock()

32  

33  

cdef int _lazy_init() except?-1:
    """Probe, exactly once, whether the cuLaunchKernelEx path can be used.

    Sets the module-level ``_use_ex`` flag based on the installed
    cuda-python binding version and the loaded driver version, then marks
    the module initialized. Returns 0 on success; the ``except?-1``
    clause lets Cython propagate any Python exception raised by the
    version queries.
    """
    global _inited, _use_ex
    # Fast path: once initialized, skip the lock entirely.
    if _inited:
        return 0

    cdef int _driver_ver
    with _lock:
        # Double-checked locking: another thread may have completed the
        # initialization while we were waiting on the lock.
        if _inited:
            return 0

        # binding availability depends on cuda-python version
        _py_major_minor = get_binding_version()
        HANDLE_RETURN(cydriver.cuDriverGetVersion(&_driver_ver))
        # The "Ex" launch API needs both driver >= 11.8 (11080) and a
        # cuda-python binding of at least (11, 8).
        _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
        # Set last so readers of the fast path never observe a
        # half-initialized _use_ex.
        _inited = True

    return 0

51  

52  

def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
    """Launches a :obj:`~_module.Kernel`
    object with launch-time configuration.

    Parameters
    ----------
    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
        The stream establishing the stream ordering semantic of a
        launch.
    config : :obj:`LaunchConfig`
        Launch configurations inline with options provided by
        :obj:`~_launcher.LaunchConfig` dataclass.
    kernel : :obj:`~_module.Kernel`
        Kernel to launch.
    *kernel_args : Any
        Variable length argument list that is provided to the
        launching kernel.

    """
    # Normalize the argument to a Stream; anything implementing the
    # stream protocol is accepted.
    cdef Stream s = Stream_accept(stream, allow_stream_protocol=True)
    # Idempotent; decides whether the cuLaunchKernelEx path is usable.
    _lazy_init()
    cdef LaunchConfig conf = check_or_create_options(LaunchConfig, config, "launch config")

    # TODO: can we ensure kernel_args is valid/safe to use here?
    # TODO: merge with HelperKernelParams?
    # ParamHolder owns the packed argument storage; args_ptr aliases it,
    # so ker_args must stay alive until after the launch call below.
    cdef ParamHolder ker_args = ParamHolder(kernel_args)
    cdef void** args_ptr = <void**><uintptr_t>(ker_args.ptr)

    # Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to
    # CUfunction for use with cuLaunchKernel, as both handle types are interchangeable
    # for kernel launch purposes.
    cdef Kernel ker = <Kernel>kernel
    cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>as_cu(ker._h_kernel)

    # Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx).
    # We check both binding & driver versions here mainly to see if the "Ex" API is
    # available and if so we use it, as it's more feature rich.
    if _use_ex:
        drv_cfg = conf._to_native_launch_config()
        drv_cfg.hStream = as_cu(s._h_stream)
        if conf.cooperative_launch:
            # Raise before launching if the grid exceeds the co-residency
            # limit, rather than failing inside the driver.
            _check_cooperative_launch(kernel, conf, s)
        # Release the GIL for the actual driver launch call.
        with nogil:
            HANDLE_RETURN(cydriver.cuLaunchKernelEx(&drv_cfg, func_handle, args_ptr, NULL))
    else:
        # TODO: check if config has any unsupported attrs
        HANDLE_RETURN(
            cydriver.cuLaunchKernel(
                func_handle,
                conf.grid[0], conf.grid[1], conf.grid[2],
                conf.block[0], conf.block[1], conf.block[2],
                conf.shmem_size, as_cu(s._h_stream), args_ptr, NULL
            )
        )

107  

108  

cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
    """Raise if the requested grid exceeds the cooperative co-residency limit.

    A cooperative launch requires every block to be resident on the device
    simultaneously, so the total grid size is capped by the per-SM
    occupancy of this kernel times the number of SMs.
    """
    device = stream.device
    sm_count = device.properties.multiprocessor_count
    blocks_per_sm = kernel.occupancy.max_active_blocks_per_multiprocessor(
        _reduce_3_tuple(config.block), config.shmem_size
    )
    max_grid_size = blocks_per_sm * sm_count
    requested = _reduce_3_tuple(config.grid)
    if requested > max_grid_size:
        # For now let's try not to be smart and adjust the grid size behind users' back.
        # We explicitly ask users to adjust.
        x, y, z = config.grid
        raise ValueError(f"The specified grid size ({x} * {y} * {z}) exceeds the limit ({max_grid_size})")