Runtime

class tensorrt_llm.runtime.ChatGLMGenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None)[source]

Bases: GenerationSession

class tensorrt_llm.runtime.GenerationSequence(seq_idx, batch_idx)[source]

Bases: object

get_batch_idx() int[source]

Returns idx of sequence in batch

get_seq_idx() int[source]

Returns sequence idx

class tensorrt_llm.runtime.GenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None)[source]

Bases: object

batch_size: int
buffer_allocated: bool
property cross_attention
cuda_graph_mode: bool
cuda_stream_guard()[source]

Sync external stream and set current stream to the one bound to the session. Reset on exit.

debug_mode: bool
debug_tensors_to_save: None
decode(input_ids: Tensor, context_lengths: Tensor, sampling_config: SamplingConfig, prompt_embedding_table: Tensor = None, tasks: Tensor = None, prompt_vocab_size: Tensor = None, stop_words_list=None, bad_words_list=None, no_repeat_ngram_size=None, streaming: bool = False, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor = None, encoder_input_lengths: Tensor = None, stopping_criteria: StoppingCriteria = None, logits_processor: LogitsProcessor = None, cross_attention_mask: Tensor = None, **kwargs)[source]
decode_batch(input_ids: Sequence[Tensor], sampling_config: SamplingConfig, streaming: bool = False, **kwargs)[source]
decode_regular(batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_data, bad_words_data, no_repeat_ngram_size, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor | None = None, encoder_input_lengths: Tensor | None = None, stopping_criteria: StoppingCriteria | None = None, logits_processor: LogitsProcessor | None = None, cross_attention_mask: Tensor | None = None, **kwargs)[source]
decode_stream(batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_data, bad_words_data, no_repeat_ngram_size, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor | None = None, encoder_input_lengths: Tensor | None = None, stopping_criteria: StoppingCriteria | None = None, logits_processor: LogitsProcessor | None = None, cross_attention_mask: Tensor | None = None, **kwargs)[source]
device: device
property dtype
dump_debug_buffers(step: int) None[source]
early_stop_criteria(batch_size, step, should_stop)[source]
filter_medusa_logits(batch_size, best_path, best_path_lengths, medusa_logits)[source]

medusa_logits is of shape [nMH, bs, nMT+1, vocab]

Returns [nMH, bs, vocab]

finalize_decoder(context_lengths, batch_size, beam_width, scfg, in_progress=False)[source]
find_best_medusa_path(batch_size, input_ids: Tensor, next_logits, temp=0)[source]
property first_layer
property gather_context_logits
property gather_generation_logits
get_next_medusa_tokens(batch_size, next_medusa_logits)[source]
handle_per_step(cache_indirections: list, step: int, batch_size: int, max_context_length: int, beam_width: int, input_ids: Tensor, hidden_states: Tensor, scfg: SamplingConfig, kv_cache_block_pointers: list, host_kv_cache_block_pointers: list, prompt_embedding_table: Tensor, tasks: Tensor, context_lengths: Tensor, host_context_lengths, attention_mask: Tensor, cross_attention_mask: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, sequence_lengths: Tensor, next_step_tensors: Dict[str, RuntimeTensor], stop_words_data, bad_words_data, no_repeat_ngram_size, encoder_output: Tensor, encoder_input_lengths: Tensor, stopping_criteria: StoppingCriteria, logits_processor: LogitsProcessor, **kwargs)[source]
property has_position_embedding
property has_token_type_embedding
property head_size
property hidden_size
property is_medusa_mode
property last_layer
mapping: Mapping
property max_medusa_tokens
property max_prompt_embedding_table_size
medusa_paths: List[List[int]] = None
medusa_position_offsets: List[int] = None
medusa_temperature: float = 0.0
medusa_topks: List[int] = None
medusa_tree_ids: List[int] = None
next_medusa_input_ids()[source]
property num_heads
property num_heads_kv
property num_layers
property num_medusa_heads
num_medusa_tokens: int = 0
property paged_kv_cache
property paged_state
pp_communicate_final_output_ids(final_output_ids, batch_size, beam_width)[source]
pp_communicate_new_tokens(should_stop, cache_indir, sequence_length)[source]
process_logits_for_medusa_mode(step, batch_size, input_ids, logits, context_has_medusa_tokens, next_step_buffer, context_lengths)[source]
property quant_mode
property remove_input_padding
runtime: _Runtime
setup(batch_size: int, max_context_length: int, max_new_tokens: int, beam_width: int = 1, max_attention_window_size: int | None = None, sink_token_length: int | None = None, encoder_max_input_length: int | None = None, lora_manager: LoraManager | None = None, lora_uids: List[str] | None = None, medusa_choices: List[List[int]] | None = None)[source]
property tokens_per_block
update_kv_cache_draft_token_location(batch_size, best_path, best_path_len)[source]
update_output_ids_by_offset(new_generated_ids, offsets)[source]
property use_context_fmha_for_generation
property use_custom_all_reduce
property use_gpt_attention_plugin
property use_lora_plugin
property use_mamba_conv1d_plugin
property vocab_size
class tensorrt_llm.runtime.KVCacheManager(memory_pools: List[Tensor], blocks: int, tokens_per_block: int, max_blocks_per_seq: int, max_attention_window_size: int, sink_token_len: int, beam_width: int = 1, use_one_more_block: bool = False)[source]

Bases: object

add_sequence(sequence: GenerationSequence, context_len: int)[source]

Add sequence to the manager and allocate minimum amount of blocks for context

get_block_pointers(beam_width: int) Tensor[source]

Returns arrays of pointers for all memory pools

step(finished: List[bool])[source]

Iterate to the next generation step. Add new blocks where needed and clear finished sequences.

class tensorrt_llm.runtime.LogitsProcessor[source]

Bases: object

Base class for all logit processors that can be applied during generation.

class tensorrt_llm.runtime.LogitsProcessorList(iterable=(), /)[source]

Bases: list, LogitsProcessor

class tensorrt_llm.runtime.MambaLMHeadModelGenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None)[source]

Bases: GenerationSession

property mamba_d_conv
property mamba_d_state
property mamba_expand
setup(batch_size: int, max_context_length: int, max_new_tokens: int, beam_width: int = 1, max_attention_window_size: int | None = None, sink_token_length: int | None = None, encoder_max_input_length: int | None = None, lora_manager: LoraManager | None = None, lora_uids: List[str] | None = None, medusa_choices: List[List[int]] | None = None)[source]
class tensorrt_llm.runtime.ModelConfig(max_batch_size: int, max_beam_width: int, vocab_size: int, num_layers: int, num_heads: int, num_kv_heads: int, hidden_size: int, gpt_attention_plugin: bool, remove_input_padding: bool = False, model_name: str = '', paged_kv_cache: bool = False, cross_attention: bool = False, head_size: int = None, has_position_embedding: bool = True, has_token_type_embedding: bool = False, tokens_per_block: int = 64, max_prompt_embedding_table_size: int = 0, quant_mode: tensorrt_llm.quantization.mode.QuantMode = <QuantMode.0: 0>, gather_context_logits: bool = False, gather_generation_logits: bool = False, dtype: str = '', use_custom_all_reduce: bool = False, lora_plugin: bool = False, lora_target_modules: List[str] = <factory>, use_context_fmha_for_generation: bool = False, trtllm_modules_to_hf_modules: dict = None, skip_cross_qkv: bool = False, num_medusa_heads: int = 0, max_medusa_tokens: int = 0, mamba_d_state: int = 0, mamba_d_conv: int = 0, mamba_expand: int = 0, paged_state: bool = True, mamba_conv1d_plugin: bool = True)[source]

Bases: object

cross_attention: bool = False
dtype: str = ''
gather_context_logits: bool = False
gather_generation_logits: bool = False
gpt_attention_plugin: bool
has_position_embedding: bool = True
has_token_type_embedding: bool = False
head_size: int = None
hidden_size: int
lora_plugin: bool = False
lora_target_modules: List[str]
mamba_conv1d_plugin: bool = True
mamba_d_conv: int = 0
mamba_d_state: int = 0
mamba_expand: int = 0
max_batch_size: int
max_beam_width: int
max_medusa_tokens: int = 0
max_prompt_embedding_table_size: int = 0
model_name: str = ''
num_heads: int
num_kv_heads: int
num_layers: int
num_medusa_heads: int = 0
paged_kv_cache: bool = False
paged_state: bool = True
quant_mode: QuantMode = 0
remove_input_padding: bool = False
skip_cross_qkv: bool = False
tokens_per_block: int = 64
trtllm_modules_to_hf_modules: dict = None
use_context_fmha_for_generation: bool = False
use_custom_all_reduce: bool = False
vocab_size: int
class tensorrt_llm.runtime.ModelRunner(session: GenerationSession, max_batch_size: int, max_input_len: int, max_seq_len: int, max_beam_width: int, lora_manager: LoraManager | None = None)[source]

Bases: ModelRunnerMixin

An interface class that wraps GenerationSession and provides generation methods.

property dtype: dtype
classmethod from_dir(engine_dir: str, lora_dir: List[str] | None = None, rank: int = 0, debug_mode: bool = False, lora_ckpt_source: str = 'hf', medusa_choices: List[List[int]] | None = None, stream: Stream | None = None) ModelRunner[source]

Create a ModelRunner instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • lora_dir (Optional[List[str]]) – The directories that contain LoRA weights.

  • rank (int) – The runtime rank id.

  • debug_mode (bool) – Whether or not to turn on the debug mode.

  • medusa_choices (List[List[int]]) – Medusa choices to use when in Medusa decoding

  • stream (torch.cuda.Stream) – Stream to use.

Returns:

An instance of ModelRunner.

Return type:

ModelRunner

classmethod from_engine(engine: Engine, lora_dir: List[str] | None = None, rank: int = 0, debug_mode: bool = False, lora_ckpt_source: str = 'hf', medusa_choices: List[List[int]] | None = None, stream: Stream | None = None) ModelRunner[source]
property gather_context_logits: bool
property gather_generation_logits: bool
generate(batch_input_ids: List[Tensor], sampling_config: SamplingConfig | None = None, prompt_table_path: str | None = None, prompt_tasks: str | None = None, lora_uids: list | None = None, streaming: bool = False, stopping_criteria: StoppingCriteria | None = None, logits_processor: LogitsProcessor | None = None, medusa_choices: List[List[int]] | None = None, **kwargs) Tensor | dict[source]

Generates sequences of token ids. The generation-controlling parameters are set in the sampling_config; it will be set to a default one if not passed. You can override any sampling_config’s attributes by passing corresponding parameters.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor is of shape (sequence_length, ).

  • sampling_config (SamplingConfig) – The sampling configuration to be used as base parametrization for the generation call. The passed **kwargs matching the sampling_config’s attributes will override them. If the sampling_config is not provided, a default will be used.

  • prompt_table_path (str) – The file path of prompt table (.npy format, exported by nemo_prompt_convert.py).

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor (LogitsProcessor) – Custom logits processors.

  • medusa_choices (List[List[int]]) – Medusa decoding choices.

  • kwargs (Dict[str, Any]) – Ad hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config’s attributes will override them.

Returns:

If return_dict=False, the method returns generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict

property hidden_size: int
property max_prompt_embedding_table_size: int
property max_sequence_length: int
property num_heads: int
property num_layers: int
property remove_input_padding: bool
serialize_engine() IHostMemory[source]

Serialize the engine.

Returns:

The serialized engine.

Return type:

bytes

property use_lora_plugin: bool
property vocab_size: int
property vocab_size_padded: int
class tensorrt_llm.runtime.ModelRunnerCpp(session: GptSession, max_batch_size: int, max_input_len: int, max_seq_len: int, max_beam_width: int, lora_manager: LoraManager | None = None)[source]

Bases: ModelRunnerMixin

An interface class that wraps GptSession and provides generation methods.

property dtype: dtype
classmethod from_dir(engine_dir: str, lora_dir: str | None = None, rank: int = 0, max_batch_size: int | None = None, max_input_len: int | None = None, max_output_len: int | None = None, max_beam_width: int | None = None, max_attention_window_size: int | None = None, sink_token_length: int | None = None, free_gpu_memory_fraction: float | None = None, debug_mode: bool = False, lora_ckpt_source: str = 'hf') ModelRunnerCpp[source]

Create a ModelRunnerCpp instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • lora_dir (str) – The directory that contains LoRA weights.

  • rank (int) – The runtime rank id.

  • max_batch_size (int) – The runtime batch size limit. If max_batch_size is not None, it should not be larger than the engine’s max_batch_size; otherwise, the engine’s max_batch_size will be used.

  • max_input_len (int) – The runtime input length limit. If max_input_len is not None, it should not be larger than the engine’s max_input_len; otherwise, the engine’s max_input_len will be used.

  • max_output_len (int) – The runtime output length limit. If max_output_len is not None, it should not be larger than the engine’s max_output_len; otherwise, the engine’s max_output_len will be used.

  • max_beam_width (int) – The runtime beam width limit. If max_beam_width is not None, it should not be larger than the engine’s max_beam_width; otherwise, the engine’s max_beam_width will be used.

  • max_attention_window_size (int) – The attention window size that controls the sliding window attention / cyclic kv cache behavior.

  • sink_token_length (int) – The sink token length, default=0.

  • free_gpu_memory_fraction (float) – The fraction of free GPU memory that the KV cache may use.

  • debug_mode (bool) – Whether or not to turn on the debug mode.

  • lora_ckpt_source (str) – Source of checkpoint. Should be one of [‘hf’, ‘nemo’].

Returns:

An instance of ModelRunnerCpp.

Return type:

ModelRunnerCpp

property gather_context_logits: bool
property gather_generation_logits: bool
generate(batch_input_ids: List[Tensor], sampling_config: SamplingConfig | None = None, prompt_table_path: str | None = None, prompt_tasks: str | None = None, lora_uids: list | None = None, streaming: bool = False, stopping_criteria: StoppingCriteria | None = None, logits_processor: LogitsProcessor | None = None, **kwargs) Tensor | dict[source]

Generates sequences of token ids. The generation-controlling parameters are set in the sampling_config; it will be set to a default one if not passed. You can override any sampling_config’s attributes by passing corresponding parameters.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor is of shape (sequence_length, ).

  • sampling_config (SamplingConfig) – The sampling configuration to be used as base parametrization for the generation call. The passed **kwargs matching the sampling_config’s attributes will override them. If the sampling_config is not provided, a default will be used.

  • prompt_table_path (str) – The file path of prompt table (.npy format, exported by nemo_prompt_convert.py).

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor (LogitsProcessor) – Custom logits processors.

  • kwargs (Dict[str, Any]) – Ad hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config’s attributes will override them.

Returns:

If return_dict=False, the method returns generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict

property hidden_size: int
property max_prompt_embedding_table_size: int
property max_sequence_length: int
property num_heads: int
property num_layers: int
property remove_input_padding: bool
property vocab_size: int
property vocab_size_padded: int
class tensorrt_llm.runtime.QWenForCausalLMGenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None, global_max_input_length: int = 2048, global_max_output_length: int = 4096)[source]

Bases: GenerationSession

generate(input_ids: Tensor, input_lengths: Tensor, sampling_config: SamplingConfig, max_new_tokens: int, runtime_rank: int = 0)[source]
class tensorrt_llm.runtime.SamplingConfig(end_id: int, pad_id: int, max_new_tokens: int = 20, num_beams: int = 1, max_attention_window_size: int | None = None, sink_token_length: int | None = None, output_sequence_lengths: bool = False, return_dict: bool = False, stop_words_list: torch.Tensor | None = None, bad_words_list: torch.Tensor | None = None, temperature: float | torch.Tensor = 1.0, top_k: int | torch.Tensor = 1, top_p: float | torch.Tensor = 0.0, top_p_decay: torch.Tensor | None = None, top_p_min: torch.Tensor | None = None, top_p_reset_ids: torch.Tensor | None = None, length_penalty: float | torch.Tensor = 1.0, early_stopping: int | torch.Tensor = 1, repetition_penalty: float | torch.Tensor = 1.0, min_length: int | torch.Tensor = 1, presence_penalty: float | torch.Tensor = 0.0, frequency_penalty: float | torch.Tensor = 0.0, use_beam_hyps: bool = True)[source]

Bases: object

bad_words_list: Tensor | None = None
beam_search_diversity_rate: float | Tensor = 0.0
early_stopping: int | Tensor = 1
end_id: int
frequency_penalty: float | Tensor = 0.0
length_penalty: float | Tensor = 1.0
max_attention_window_size: int | None = None
max_new_tokens: int = 20
min_length: int | Tensor = 1
num_beams: int = 1
output_cum_log_probs: bool = False
output_log_probs: bool = False
output_sequence_lengths: bool = False
pad_id: int
presence_penalty: float | Tensor = 0.0
random_seed: int | Tensor = None
repetition_penalty: float | Tensor = 1.0
return_dict: bool = False
sink_token_length: int | None = None
stop_words_list: Tensor | None = None
temperature: float | Tensor = 1.0
top_k: int | Tensor = 1
top_p: float | Tensor = 0.0
top_p_decay: Tensor | None = None
top_p_min: Tensor | None = None
top_p_reset_ids: Tensor | None = None
update(**kwargs)[source]
use_beam_hyps: bool = True
class tensorrt_llm.runtime.Session(**kwargs)[source]

Bases: object

Session is a managed TensorRT runtime.

property context: IExecutionContext
@brief: Get the default TensorRT execution context; use self.engine.create_execution_context() to create a new context if needed.

@return: one TensorRT execution context object

property engine: ICudaEngine
static from_engine(engine) Session[source]

@brief: Create a session from an existing ICudaEngine engine @param engine: an ICudaEngine @return: a Session object

static from_serialized_engine(engine) Session[source]

@brief: Create a session from a serialized engine @param engine: a serialized engine @return: a Session object

infer_shapes(inputs: List[TensorInfo], context: IExecutionContext | None = None) List[TensorInfo][source]
@brief: Set input shapes to given context, and infer the output shapes from the given input shapes.

This function should be called every time the input shapes change, before calling run(). Alternatively, call context.set_input_shape manually on all dynamically shaped input tensors.

@param inputs: list of TensorInfo object, each item represents an input tensor @param context: TensorRT execution context, if None, use the default context @return: list of TensorInfo object, each item represents an output tensor, returns None if failed

run(inputs: Dict[str, Any], outputs: Dict[str, Any], stream, context=None) bool[source]

@brief: Run the TensorRT engine with the given inputs and outputs @param inputs: dict of input tensors, key is tensor name, value is tensor pointer or torch tensor @param outputs: dict of output tensors, key is tensor name, value is tensor pointer or torch tensor @param stream: cuda stream to enqueue the TensorRT engine on @param context: TensorRT execution context, if None, use the default context @return: True if enqueue succeeded; note that the enqueue is an async call, so returning True does not mean the execution is finished

property runtime: Runtime
set_shapes(tensor_dict: Dict[str, Tensor], context: IExecutionContext | None = None)[source]
class tensorrt_llm.runtime.StoppingCriteria[source]

Bases: object

Base class for all stopping criteria that can be applied during generation.

class tensorrt_llm.runtime.StoppingCriteriaList(iterable=(), /)[source]

Bases: list, StoppingCriteria

class tensorrt_llm.runtime.TensorInfo(name: 'str', dtype: 'trt.DataType', shape: 'tuple')[source]

Bases: object

dtype: DataType
name: str
shape: tuple
tensorrt_llm.runtime.to_word_list_format(word_dict: List[List[str]], tokenizer=None, add_special_tokens=False)[source]
format of word_dict

len(word_dict) should be the same as batch_size. word_dict[i] holds the words for batch i. len(word_dict[i]) must be 1, which means it contains only one string. This string can contain several sentences separated by “,”. For example, if word_dict[2] = “ I am happy, I am sad”, then this function will return the ids for the two short sentences “ I am happy” and “ I am sad”.