Runtime

class tensorrt_llm.runtime.ChatGLMGenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream = None)[source]

Bases: GenerationSession

class tensorrt_llm.runtime.EncDecModelRunner(engine_name, engine_dir, lora_dir=None, lora_task_uids=None, debug_mode=False, skip_encoder=False, stream: Stream = None, enable_context_fmha_fp32_acc: bool = None)[source]

Bases: object

encoder_run(input_ids, input_lengths, max_input_length, position_ids=None, token_type_ids=None, debug_mode=False, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, attention_mask=None)[source]
classmethod from_engine(engine_name, engine_dir, lora_dir=None, lora_task_uids=None, debug_mode=False, skip_encoder=False, stream=None, enable_context_fmha_fp32_acc=None)[source]
generate(encoder_input_ids, decoder_input_ids, max_new_tokens, num_beams=1, pad_token_id=None, eos_token_id=None, bos_token_id=None, debug_mode=False, return_dict=False, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, attention_mask=None, time_encoder=False, return_encoder_output=False)[source]
process_input(input_ids, remove_input_padding=False, pad_token_id=0, prompt_tasks=None)[source]
class tensorrt_llm.runtime.GenerationSequence(seq_idx, batch_idx)[source]

Bases: object

get_batch_idx() int[source]

Returns idx of sequence in batch

get_seq_idx() int[source]

Returns sequence idx

class tensorrt_llm.runtime.GenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream = None)[source]

Bases: object

batch_size: int
buffer_allocated: bool
property context_mem_size: int
property conv_kernel
property cross_attention
cuda_graph_mode: bool
cuda_stream_guard()[source]

Sync external stream and set current stream to the one bound to the session. Reset on exit.

debug_mode: bool
debug_tensors_to_save: None
decode(input_ids: Tensor, context_lengths: Tensor, sampling_config: SamplingConfig, prompt_embedding_table: Tensor = None, tasks: Tensor = None, prompt_vocab_size: Tensor = None, stop_words_list=None, bad_words_list=None, streaming: bool = False, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor = None, encoder_input_lengths: Tensor = None, stopping_criteria: StoppingCriteria = None, logits_processor: LogitsProcessor = None, cross_attention_mask: List[Tensor] = None, **kwargs)[source]
decode_batch(input_ids: Sequence[Tensor], sampling_config: SamplingConfig, streaming: bool = False, **kwargs)[source]
decode_regular(*, batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_data, bad_words_data, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor = None, encoder_input_lengths: Tensor = None, stopping_criteria: StoppingCriteria = None, logits_processor: LogitsProcessor = None, cross_attention_mask: List[Tensor] = None, **kwargs)[source]
decode_stream(*, batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_data, bad_words_data, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor = None, encoder_input_lengths: Tensor = None, stopping_criteria: StoppingCriteria = None, logits_processor: LogitsProcessor = None, cross_attention_mask: List[Tensor] = None, **kwargs)[source]
device: device
property dtype
dump_debug_buffers(step: int) None[source]
early_stop_criteria(batch_size, step, should_stop)[source]
property engine_inspector
filter_medusa_logits(batch_size, best_path, best_path_lengths, medusa_logits)[source]

medusa_logits is of shape [nMH, bs, nMT+1, vocab]

Returns [nMH, bs, vocab]

finalize_decoder(context_lengths, batch_size, beam_width, scfg, in_progress=False)[source]
find_best_medusa_path(batch_size, input_ids: Tensor, next_logits, temp=0)[source]
property first_layer
property gather_context_logits
property gather_generation_logits
get_next_medusa_tokens(batch_size, next_medusa_logits)[source]
get_num_heads_kv(layer_idx: int | None = None) int[source]
handle_per_step(*, cache_indirections: list, step: int, batch_size: int, max_context_length: int, beam_width: int, input_ids: Tensor, hidden_states: Tensor, scfg: SamplingConfig, kv_cache_block_offsets: Tensor, host_kv_cache_block_offsets: Tensor, cross_kv_cache_block_offsets: Tensor, host_cross_kv_cache_block_offsets: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, context_lengths: Tensor, host_context_lengths, attention_mask: Tensor, cross_attention_mask_for_context: Tensor, cross_attention_mask_for_gen: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, sequence_lengths: Tensor, next_step_tensors: Dict[str, RuntimeTensor], stop_words_data, bad_words_data, encoder_output: Tensor, encoder_input_lengths: Tensor, stopping_criteria: StoppingCriteria, logits_processor: LogitsProcessor, **kwargs)[source]
property has_position_embedding
property has_token_type_embedding
property head_size
property hidden_size
property is_medusa_mode
property is_redrafter_mode
property kv_cache_type
property last_layer
locate_accepted_draft_tokens(batch_size, best_path, best_path_len, draft_paths)[source]
mapping: Mapping
property max_draft_tokens
property max_prompt_embedding_table_size
medusa_decode_and_verify(step, batch_size, logits)[source]
medusa_paths: List[List[int]] = None
medusa_position_offsets: List[int] = None
medusa_temperature: float = 0.0
medusa_topks: List[int] = None
medusa_tree_ids: List[int] = None
next_medusa_input_ids()[source]
num_draft_tokens: int = 0
property num_heads
property num_layers
property num_medusa_heads
property paged_kv_cache
property paged_state
pp_communicate_final_output_ids(final_output_ids, batch_size, beam_width)[source]
pp_communicate_new_tokens(should_stop, cache_indir, sequence_length)[source]
process_logits_including_draft(step, batch_size, logits, next_step_buffer)[source]
  1. Process logits to tokens and validate (Medusa) or process outputs (ReDrafter)

  2. Extract early stop criteria here : self.accept_length

  3. Update output ids : needs self.new_tokens and past_sequence_length

  4. Get next input_ids : self.[new_tokens, accept_lengths, medusa_output_tokens]

  5. Update KV cache : self.[sequence_length, num_draft_tokens]

  6. Update sequence_length_buffer and past_kv_length

property profiler
property quant_mode
property remove_input_padding
property rnn_conv_dim_size
property rnn_head_size
property rnn_hidden_size
runtime: _Runtime
setup(batch_size: int, max_context_length: int, max_new_tokens: int, beam_width: int = 1, max_attention_window_size: int | None = None, sink_token_length: int | None = None, encoder_max_input_length: int | None = None, lora_manager: LoraManager = None, lora_uids: List[str] = None, medusa_choices: List[List[int]] = None, multi_block_mode: bool = True, enable_context_fmha_fp32_acc: bool = None)[source]
property state_dtype
property state_size
property tokens_per_block
update_output_ids_by_offset(new_generated_ids, offsets)[source]
property use_gpt_attention_plugin
property use_kv_cache
property use_lora_plugin
property use_mamba_conv1d_plugin
property vocab_size
class tensorrt_llm.runtime.KVCacheManager(*, num_layers: int, num_blocks: int, block_size: int, tokens_per_block: int, max_blocks_per_seq: int, max_attention_window_size: int, sink_token_len: int, beam_width: int = 1, use_one_more_block: bool = False)[source]

Bases: object

add_sequence(sequence: GenerationSequence, context_len: int, always_share_across_beam: bool = False)[source]

Add sequence to the manager and allocate minimum amount of blocks for context

get_block_offsets(beam_width: int) Tensor[source]

Returns array of offsets into memory pools

step(finished: List[bool])[source]

Iterate to the next generation step. Add new blocks where needed and clear finished sequences.

class tensorrt_llm.runtime.LogitsProcessor[source]

Bases: object

Base class for all logit processors that can be applied during generation.

class tensorrt_llm.runtime.LogitsProcessorList(iterable=(), /)[source]

Bases: list, LogitsProcessor

class tensorrt_llm.runtime.ModelConfig(max_batch_size: int, max_beam_width: int, vocab_size: int, num_layers: int, num_heads: int, num_kv_heads: int, hidden_size: int, gpt_attention_plugin: bool, remove_input_padding: bool = False, model_name: str = '', kv_cache_type: tensorrt_llm.bindings.KVCacheType = <KVCacheType.CONTINUOUS: 0>, cross_attention: bool = False, head_size: int = None, has_position_embedding: bool = True, has_token_type_embedding: bool = False, tokens_per_block: int = 64, max_prompt_embedding_table_size: int = 0, quant_mode: tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>, gather_context_logits: bool = False, gather_generation_logits: bool = False, dtype: str = '', lora_plugin: bool = False, lora_target_modules: List[str] = <factory>, trtllm_modules_to_hf_modules: dict = None, skip_cross_kv: bool = False, num_medusa_heads: int = 0, max_medusa_tokens: int = 0, paged_state: bool = True, mamba_conv1d_plugin: bool = True, conv_kernel: int = 0, layer_types: List[str] = <factory>, rnn_hidden_size: int = 0, rnn_head_size: int = 0, rnn_conv_dim_size: int = 0, state_size: int = 0, state_dtype: str = '', gpu_weights_percent: float = 1.0, redrafter_num_beams: int = 0, redrafter_draft_len_per_beam: int = 0, num_kv_heads_per_layer: Optional[List[int]] = None, num_kv_heads_per_cross_attn_layer: Optional[List[int]] = None, skip_cross_attn_blocks: bool = False)[source]

Bases: object

conv_kernel: int = 0
cross_attention: bool = False
dtype: str = ''
gather_context_logits: bool = False
gather_generation_logits: bool = False
gpt_attention_plugin: bool
gpu_weights_percent: float = 1.0
has_position_embedding: bool = True
has_token_type_embedding: bool = False
head_size: int = None
hidden_size: int
kv_cache_type: KVCacheType = <KVCacheType.CONTINUOUS: 0>
layer_types: List[str]
lora_plugin: bool = False
lora_target_modules: List[str]
mamba_conv1d_plugin: bool = True
max_batch_size: int
max_beam_width: int
max_medusa_tokens: int = 0
max_prompt_embedding_table_size: int = 0
model_name: str = ''
num_heads: int
num_kv_heads: int
num_kv_heads_per_cross_attn_layer: List[int] | None = None
num_kv_heads_per_layer: List[int] | None = None
num_layers: int
num_medusa_heads: int = 0
paged_state: bool = True
quant_mode: QuantMode = 0
redrafter_draft_len_per_beam: int = 0
redrafter_num_beams: int = 0
remove_input_padding: bool = False
rnn_conv_dim_size: int = 0
rnn_head_size: int = 0
rnn_hidden_size: int = 0
skip_cross_attn_blocks: bool = False
skip_cross_kv: bool = False
state_dtype: str = ''
state_size: int = 0
tokens_per_block: int = 64
trtllm_modules_to_hf_modules: dict = None
vocab_size: int
class tensorrt_llm.runtime.ModelRunner(session: GenerationSession, max_batch_size: int, max_input_len: int, max_seq_len: int, max_beam_width: int, kv_cache_type: KVCacheType, lora_manager: LoraManager | None = None)[source]

Bases: ModelRunnerMixin

An interface class that wraps GenerationSession and provides generation methods.

property dtype: dtype
classmethod from_dir(engine_dir: str, *, max_output_len: int | None = None, lora_dir: List[str] | None = None, rank: int = 0, debug_mode: bool = False, lora_ckpt_source: str = 'hf', medusa_choices: List[List[int]] = None, stream: Stream = None, gpu_weights_percent: float = 1, enable_context_fmha_fp32_acc: bool | None = None, multi_block_mode: bool | None = None) ModelRunner[source]

Create a ModelRunner instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • max_output_len (Optional[int]) – The maximum output length. This argument may only be available at loading time; generate() will still check it when disable_kv_cache is enabled.

  • lora_dir (Optional[List[str]]) – The directories that contain LoRA weights.

  • rank (int) – The runtime rank id.

  • debug_mode (bool) – Whether or not to turn on the debug mode.

  • medusa_choices (List[List[int]]) – Medusa choices to use when in Medusa decoding

  • stream (torch.cuda.Stream) – Stream to use.

  • multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.

Returns:

An instance of ModelRunner.

Return type:

ModelRunner

classmethod from_engine(engine: Engine, *, max_output_len: int | None, lora_dir: List[str] | None, rank: int, debug_mode: bool, lora_ckpt_source: str, medusa_choices: List[List[int]], stream: Stream, gpu_weights_percent: float, enable_context_fmha_fp32_acc: bool | None, multi_block_mode: bool | None) ModelRunner[source]
property gather_context_logits: bool
property gather_generation_logits: bool
generate(batch_input_ids: List[Tensor], position_ids: List[Tensor] = None, sampling_config: SamplingConfig | None = None, prompt_table: str | Tensor | None = None, prompt_tasks: str | None = None, lora_uids: list | None = None, streaming: bool = False, stopping_criteria: StoppingCriteria | None = None, logits_processor: LogitsProcessor | None = None, medusa_choices: List[List[int]] | None = None, encoder_max_input_length: int = None, encoder_input_features: List[Tensor] = None, encoder_output_lengths: List[Tensor] = None, cross_attention_masks: List[Tensor] = None, **kwargs) Tensor | dict[source]

Generates sequences of token ids. The generation-controlling parameters are set in the sampling_config; it will be set to a default one if not passed. You can override any sampling_config’s attributes by passing corresponding parameters.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor is of shape (sequence_length, ).

  • sampling_config (SamplingConfig) – The sampling configuration to be used as base parametrization for the generation call. The passed **kwargs matching the sampling_config’s attributes will override them. If the sampling_config is not provided, a default will be used.

  • prompt_table (str or torch.Tensor) – The file path of prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor (LogitsProcessor) – Custom logits processors.

  • medusa_choices (List[List[int]]) – Medusa decoding choices.

  • kwargs (Dict[str, Any]) – Ad hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config’s attributes will override them.

Returns:

If return_dict=False, the method returns generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict

property hidden_size: int
property mapping: Mapping
property max_prompt_embedding_table_size: int
property max_sequence_length: int
property num_heads: int
property num_layers: int
property remove_input_padding: bool
serialize_engine() IHostMemory[source]

Serialize the engine.

Returns:

The serialized engine.

Return type:

IHostMemory

property use_lora_plugin: bool
property vocab_size: int
property vocab_size_padded: int
class tensorrt_llm.runtime.ModelRunnerCpp(executor: Executor, max_batch_size: int, max_input_len: int, max_seq_len: int, max_beam_width: int, model_config: ModelConfig, world_config: WorldConfig, use_kv_cache: bool, lora_manager: LoraManager | None = None)[source]

Bases: ModelRunnerMixin

An interface class that wraps Executor and provides generation methods.

property dtype: dtype
classmethod from_dir(engine_dir: str, *, lora_dir: str | None = None, rank: int = 0, max_batch_size: int | None = None, max_input_len: int | None = None, max_output_len: int | None = None, max_beam_width: int | None = None, max_attention_window_size: list[int] | None = None, sink_token_length: int | None = None, kv_cache_free_gpu_memory_fraction: float | None = None, cross_kv_cache_fraction: float | None = None, medusa_choices: list[list[int]] | None = None, eagle_choices: list[list[int]] | None = None, eagle_posterior_threshold: float | None = None, lookahead_config: list[int] | None = None, debug_mode: bool = False, lora_ckpt_source: str = 'hf', gpu_weights_percent: float = 1, max_tokens_in_paged_kv_cache: int | None = None, kv_cache_enable_block_reuse: bool = False, enable_chunked_context: bool = False, is_enc_dec: bool = False, multi_block_mode: bool = True, enable_context_fmha_fp32_acc: bool | None = None, cuda_graph_mode: bool | None = None, logits_processor_map: Dict[str, LogitsProcessor] | None = None, device_ids: List[int] | None = None, is_orchestrator_mode: bool = False, use_runtime_defaults: bool = True, backend: str | None = None, py_executor_config: dict = {}) ModelRunnerCpp[source]

Create a ModelRunnerCpp instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • lora_dir (str) – The directory that contains LoRA weights.

  • rank (int) – The runtime rank id.

  • max_batch_size (int) – The runtime batch size limit. If max_batch_size is not None, it should not be larger than the engine’s max_batch_size; otherwise, the engine’s max_batch_size will be used.

  • max_input_len (int) – The runtime input length limit. If max_input_len is not None, it should not be larger than the engine’s max_input_len; otherwise, the engine’s max_input_len will be used.

  • max_output_len (int) – The runtime output length limit. If max_output_len is not None, it should not be larger than the engine’s max_output_len; otherwise, the engine’s max_output_len will be used.

  • max_beam_width (int) – The runtime beam width limit. If max_beam_width is not None, it should not be larger than the engine’s max_beam_width; otherwise, the engine’s max_beam_width will be used.

  • max_attention_window_size (List[int]) – The attention window size that controls the sliding window attention / cyclic kv cache behavior.

  • sink_token_length (int) – The sink token length, default=0.

  • kv_cache_free_gpu_memory_fraction (float) – The fraction of free GPU memory that the KV cache may use.

  • cross_kv_cache_fraction (float) – KV Cache fraction reserved for cross attention, should only be used with enc-dec models.

  • debug_mode (bool) – Whether or not to turn on the debug mode.

  • medusa_choices (List[List[int]]) – Medusa choices to use when in Medusa decoding.

  • eagle_choices (List[List[int]]) – Eagle choices to use when in Eagle-1 decoding.

  • eagle_posterior_threshold (float) – Minimum token probability threshold for typical acceptance. A value other than None enables typical acceptance in Eagle.

  • lora_ckpt_source (str) – Source of checkpoint. Should be one of [‘hf’, ‘nemo’].

  • max_tokens_in_paged_kv_cache (int) – Maximum amount of tokens configured in kv cache.

  • kv_cache_enable_block_reuse (bool) – Enables block reuse in kv cache.

  • enable_chunked_context (bool) – Enables chunked context.

  • is_enc_dec (bool) – Whether the model is encoder-decoder architecture.

  • multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.

  • enable_context_fmha_fp32_acc (bool) – Enable FMHA runner FP32 accumulation.

  • cuda_graph_mode (bool) – Whether to use cuda graph for inference.

  • logits_processor_map (Dict[str, LogitsProcessor]) – A map of logits processor functions indexed by names. A name can be provided later to the generate() function to specify which logits processor to run.

  • device_ids (List[int]) – Device indices to run the Executor on.

  • is_orchestrator_mode (bool) – Whether to run the model runner in orchestrator mode. Leader mode is used by default.

Returns:

An instance of ModelRunnerCpp.

Return type:

ModelRunnerCpp

property gather_context_logits: bool
property gather_generation_logits: bool
generate(batch_input_ids: List[Tensor], *, position_ids: List[Tensor] = None, encoder_input_ids: List[Tensor] = None, encoder_input_features: List[Tensor] = None, encoder_output_lengths: List[int] = None, cross_attention_masks: List[Tensor] = None, mrope_params: MropeParams | None = None, sampling_config: SamplingConfig | None = None, lora_uids: list | None = None, lookahead_config: list[int] | None = None, streaming: bool = False, stopping_criteria: StoppingCriteria | None = None, logits_processor_names: list[str] | None = None, max_new_tokens: int = 1, end_id: int | None = None, pad_id: int | None = None, bad_words_list: list[list[int]] | None = None, stop_words_list: list[list[int]] | None = None, return_dict: bool = False, output_sequence_lengths: bool = False, output_log_probs: bool = False, output_cum_log_probs: bool = False, prompt_table: str | Tensor | None = None, prompt_tasks: str | None = None, input_token_extra_ids: List[List[int]] = None, return_all_generated_tokens: bool = False, **kwargs) Tensor | dict[source]

Generates sequences of token ids. The generation-controlling parameters are set in the sampling_config; it will be set to a default one if not passed. You can override any sampling_config’s attributes by passing corresponding parameters.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor is of shape (sequence_length, ).

  • position_ids (List[torch.Tensor]) – A list of position id tensors. Each tensor is of shape (sequence_length, ).

  • encoder_input_ids (List[torch.Tensor]) – A list of encoder input id tensors for encoder-decoder models (optional). Each tensor is of shape (sequence_length, ).

  • encoder_input_features (List[torch.Tensor]) – A list of encoder input feature tensors for multimodal encoder-decoder models (optional). Each tensor is of shape (sequence_length, feature_dim).

  • encoder_output_lengths (List[int]) – A list of encoder output lengths (optional), used when the encoder output has a different length from the encoder input (due to convolution down-sampling, etc.).

  • sampling_config (SamplingConfig) – The sampling configuration to be used as base parametrization for the generation call. The passed **kwargs matching the sampling_config’s attributes will override them. If the sampling_config is not provided, a default will be used.

  • prompt_table (str or torch.Tensor) – The file path of prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).

  • input_token_extra_ids (List[List[int]]) – Input token extra ids for using p-tuning and KV Cache reuse together

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor_names (List[str]) – Custom logits processor names.

  • return_all_generated_tokens (bool) – Whether the full output is returned at each streaming step

  • kwargs (Dict[str, Any]) – Ad hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config’s attributes will override them.

Returns:

If return_dict=False, the method returns generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict

property hidden_size: int
property max_prompt_embedding_table_size: int
property max_sequence_length: int
property num_heads: int
property num_layers: int
property remove_input_padding: bool
property vocab_size: int
property vocab_size_padded: int
class tensorrt_llm.runtime.MultimodalModelRunner(args)[source]

Bases: object

generate(pre_prompt, post_prompt, image, decoder_input_ids, max_new_tokens, warmup=False, other_vision_inputs={}, other_decoder_inputs={})[source]
get_rope_index(input_ids: LongTensor, image_grid_thw: LongTensor | None = None, video_grid_thw: LongTensor | None = None, attention_mask: Tensor | None = None) Tuple[Tensor, Tensor][source]

Calculate the 3D rope index based on image and video’s temporal, height and width in LLM.

Explanation:

Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs. Examples:

input_ids: [T T T T T], here T is for text.
temporal position_ids: [0, 1, 2, 3, 4]
height position_ids: [0, 1, 2, 3, 4]
width position_ids: [0, 1, 2, 3, 4]

For vision and text embedding sequence, we calculate 3D rotary position embedding for the vision part and 1D rotary position embedding for the text part. Examples:

Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
text temporal position_ids: [3, 4, 5, 6, 7]
text height position_ids: [3, 4, 5, 6, 7]
text width position_ids: [3, 4, 5, 6, 7]
Here we calculate the text start position_ids as the max vision position_ids plus 1.

Parameters:
  • input_ids (torch.LongTensor of shape (batch_size, sequence_length)) – Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.

  • image_grid_thw (torch.LongTensor of shape (num_images, 3), optional) – The temporal, height and width of feature shape of each image in LLM.

  • video_grid_thw (torch.LongTensor of shape (num_videos, 3), optional) – The temporal, height and width of feature shape of each video in LLM.

  • attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional) –

    Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]:

    • 1 for tokens that are not masked,

    • 0 for tokens that are masked.

Returns:

position_ids (torch.LongTensor of shape (3, batch_size, sequence_length)) and mrope_position_deltas (torch.Tensor of shape (batch_size)).

get_visual_features(image, other_vision_inputs)[source]
init_image_encoder()[source]
init_llm()[source]
init_processor()[source]
init_tokenizer()[source]
load_test_image()[source]
prepare_position_ids_for_cogvlm(input_ids)[source]
preprocess(warmup, pre_prompt, post_prompt, image, other_vision_inputs)[source]
ptuning_setup(prompt_table, input_ids, input_lengths)[source]
ptuning_setup_fuyu(input_ids, image_patches_indices)[source]
ptuning_setup_llava_next(visual_features, pre_prompt, post_prompt)[source]
ptuning_setup_phi3(visual_features, input_ids, num_img_tokens)[source]
run(input_text, input_image, max_new_tokens)[source]
setup_fake_prompts(visual_features, pre_input_ids, post_input_ids, input_lengths)[source]
setup_fake_prompts_qwen2vl(visual_features, input_ids, vision_grid_thws, attention_mask, input_lengths)[source]
setup_fake_prompts_vila(batch_size, visual_features, split_input_ids, input_lengths)[source]
setup_inputs(input_text, raw_image)[source]
split_prompt_by_images(tensor)[source]
static tokenizer_image_token(batch_size, prompt, tokenizer, image_token_index=-200)[source]
video_preprocess(video_path)[source]
class tensorrt_llm.runtime.QWenForCausalLMGenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream = None, global_max_input_length: int = 2048, global_max_output_length: int = 4096)[source]

Bases: GenerationSession

generate(input_ids: Tensor, input_lengths: Tensor, sampling_config: SamplingConfig, max_new_tokens: int, runtime_rank: int = 0)[source]
class tensorrt_llm.runtime.SamplingConfig(end_id: int, pad_id: int, max_new_tokens: int = 20, num_beams: int = 1, num_return_sequences: int | None = None, max_attention_window_size: int | None = None, sink_token_length: int | None = None, output_sequence_lengths: bool = False, return_dict: bool = False, stop_words_list: list | numpy.ndarray | torch.Tensor | NoneType = None, bad_words_list: list | numpy.ndarray | torch.Tensor | NoneType = None, temperature: float | torch.Tensor = 1.0, top_k: int | torch.Tensor = 1, top_p: float | torch.Tensor = 0.0, top_p_decay: torch.Tensor | None = None, top_p_min: torch.Tensor | None = None, top_p_reset_ids: torch.Tensor | None = None, length_penalty: float | torch.Tensor = 1.0, early_stopping: int | torch.Tensor = 1, repetition_penalty: float | torch.Tensor = 1.0, min_length: int | torch.Tensor = 1, presence_penalty: float | torch.Tensor = 0.0, frequency_penalty: float | torch.Tensor = 0.0, use_beam_hyps: bool = True)[source]

Bases: object

bad_words_list: list | ndarray | Tensor | None = None
beam_search_diversity_rate: float | Tensor = 0.0
early_stopping: int | Tensor = 1
end_id: int
frequency_penalty: float | Tensor = 0.0
length_penalty: float | Tensor = 1.0
max_attention_window_size: int | None = None
max_new_tokens: int = 20
min_length: int | Tensor = 1
no_repeat_ngram_size: int | Tensor = None
num_beams: int = 1
num_return_sequences: int | None = None
output_cum_log_probs: bool = False
output_log_probs: bool = False
output_sequence_lengths: bool = False
pad_id: int
presence_penalty: float | Tensor = 0.0
random_seed: int | Tensor = None
repetition_penalty: float | Tensor = 1.0
return_dict: bool = False
sink_token_length: int | None = None
stop_words_list: list | ndarray | Tensor | None = None
temperature: float | Tensor = 1.0
top_k: int | Tensor = 1
top_p: float | Tensor = 0.0
top_p_decay: Tensor | None = None
top_p_min: Tensor | None = None
top_p_reset_ids: Tensor | None = None
update(**kwargs)[source]
use_beam_hyps: bool = True
class tensorrt_llm.runtime.Session(**kwargs)[source]

Bases: object

Session is a managed TensorRT runtime.

property context: IExecutionContext
@brief: Get the default TensorRT execution context; use self.engine.create_execution_context() to create a new context if needed.

@return: one TensorRT execution context object

property context_mem_size: int
property engine: ICudaEngine
static from_engine(engine) Session[source]

@brief: Create a session from an existing ICudaEngine engine @param engine: an ICudaEngine @return: a Session object

static from_serialized_engine(engine) Session[source]

@brief: Create a session from a serialized engine @param engine: a serialized engine @return: a Session object

infer_shapes(inputs: List[TensorInfo], context: IExecutionContext | None = None) List[TensorInfo][source]
@brief: Set input shapes to given context, and infer the output shapes from the given input shapes.

This function should be called every time when the input shapes are changed before calling run(). Or call the context.set_input_shape on all dynamic shaped input tensors manually.

@param inputs: list of TensorInfo object, each item represents an input tensor @param context: TensorRT execution context, if None, use the default context @return: list of TensorInfo object, each item represents an output tensor, returns None if failed

run(inputs: Dict[str, Any], outputs: Dict[str, Any], stream, context=None) bool[source]

@brief: Run the TensorRT engine with the given inputs and outputs
@param inputs: dict of input tensors, key is tensor name, value is tensor pointer or torch tensor
@param outputs: dict of output tensors, key is tensor name, value is tensor pointer or torch tensor
@param stream: cuda stream to enqueue the TensorRT engine on
@param context: TensorRT execution context, if None, use the default context
@return: True if enqueue succeeded. Note that the enqueue is an async call; returning True does not mean the execution is finished.

property runtime: Runtime
set_shapes(tensor_dict: Dict[str, Tensor], context: IExecutionContext | None = None)[source]
class tensorrt_llm.runtime.StoppingCriteria[source]

Bases: object

Base class for all stopping criteria that can be applied during generation.

class tensorrt_llm.runtime.StoppingCriteriaList(iterable=(), /)[source]

Bases: list, StoppingCriteria

class tensorrt_llm.runtime.TensorInfo(name: 'str', dtype: 'trt.DataType', shape: 'tuple')[source]

Bases: object

dtype: DataType
name: str
shape: tuple
tensorrt_llm.runtime.decode_words_list(word_dict: List[List[str]], tokenizer=None, add_special_tokens=False)[source]
Format of word_dict:

len(word_dict) should be equal to batch_size. word_dict[i] holds the words for batch i. len(word_dict[i]) >= 1, which means it must contain at least 1 string. For example, word_dict[2] = [" I am happy", " I am sad"].