Models

class tensorrt_llm.models.BaichuanForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

class tensorrt_llm.models.BertForQuestionAnswering(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, type_vocab_size, pad_token_id=None, is_roberta=False, num_labels=2, mapping=<tensorrt_llm.mapping.Mapping object>, dtype=None)[source]

Bases: Module

forward(input_ids=None, input_lengths=None, token_type_ids=None, position_ids=None, hidden_states=None)[source]
class tensorrt_llm.models.BertForSequenceClassification(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, type_vocab_size, pad_token_id=None, is_roberta=False, num_labels=2, mapping=<tensorrt_llm.mapping.Mapping object>, dtype=None)[source]

Bases: Module

forward(input_ids=None, input_lengths=None, token_type_ids=None, position_ids=None, hidden_states=None)[source]
class tensorrt_llm.models.BertModel(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, type_vocab_size, pad_token_id=None, is_roberta=False, mapping=<tensorrt_llm.mapping.Mapping object>, dtype=None)[source]

Bases: Module

forward(input_ids=None, input_lengths=None, position_ids=None, token_type_ids=None, hidden_states=None)[source]
class tensorrt_llm.models.BloomForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

class tensorrt_llm.models.BloomModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, attention_params=None)[source]
class tensorrt_llm.models.ChatGLMForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config(config: PretrainedConfig)[source]
prepare_inputs(*args, **kwargs)[source]

See PretrainedModel.prepare_inputs for the detailed parameter list.

class tensorrt_llm.models.ChatGLMModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids: Tensor | None = None, position_ids: Tensor | None = None, use_cache: bool = False, attention_mask: Tensor | None = None, kv_cache_params: KeyValueCacheParams | None = None, attention_params: AttentionParams | None = None)[source]
class tensorrt_llm.models.DecoderModel(num_layers, num_heads, hidden_size, ffn_hidden_size, encoder_num_heads, encoder_hidden_size, vocab_size, dtype, logits_dtype='float32', head_size=None, encoder_head_size=None, num_kv_heads=None, encoder_num_kv_heads=None, max_position_embeddings=None, has_position_embedding=False, relative_attention=False, max_distance=None, num_buckets=None, type_vocab_size=None, has_embedding_layernorm=False, has_embedding_scale=False, q_scaling=1.0, has_attention_qkvo_bias=False, has_mlp_bias=False, has_model_final_layernorm=False, layernorm_eps=1e-05, layernorm_position=LayerNormPositionType.pre_layernorm, layernorm_type=LayerNormType.LayerNorm, hidden_act='relu', mlp_type=MLPType.MLP, rescale_before_lm_head=False, has_lm_head_bias=False, residual_scaling=1.0, use_parallel_embedding=False, embedding_sharding_dim=0, mapping=<tensorrt_llm.mapping.Mapping object>, fp16_clamping=False, max_lora_rank=None, skip_cross_qkv=False)[source]

Bases: Module, GenerationMixin

forward(decoder_input_ids: Tensor, encoder_output: Tensor, position_ids=None, token_type_ids=None, use_cache=False, attention_mask=None, cross_attention_mask=None, last_token_ids=None, kv_cache_params=None, attention_params=None, hidden_states=None, lora_params: LoraParams | None = None, cross_kv_cache_gen: Tensor | None = None, cross_qkv_reuse: Tensor | None = None)[source]
prepare_inputs(max_batch_size, max_beam_width, max_decoder_input_len, max_new_tokens, max_encoder_input_len, gather_context_logits: bool = False, gather_generation_logits: bool = False, lora_target_modules: List[str] | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.EncoderModel(num_layers, num_heads, hidden_size, ffn_hidden_size, vocab_size, dtype, head_size=None, num_kv_heads=None, max_position_embeddings=None, has_position_embedding=False, relative_attention=False, max_distance=None, num_buckets=None, type_vocab_size=None, has_embedding_layernorm=False, has_embedding_scale=False, q_scaling=1.0, has_attention_qkvo_bias=False, has_mlp_bias=False, has_model_final_layernorm=False, layernorm_eps=1e-05, layernorm_position=LayerNormPositionType.pre_layernorm, layernorm_type=LayerNormType.LayerNorm, hidden_act='relu', mlp_type=MLPType.MLP, residual_scaling=1.0, use_prompt_tuning=False, use_parallel_embedding=False, embedding_sharding_dim=0, mapping=<tensorrt_llm.mapping.Mapping object>, fp16_clamping=False, max_lora_rank=None)[source]

Bases: Module, GenerationMixin

forward(input_ids: Tensor, input_lengths=None, position_ids=None, token_type_ids=None, hidden_states=None, max_input_length=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, attention_mask=None, lora_params: LoraParams | None = None)[source]
prepare_inputs(max_batch_size, max_input_len, prompt_embedding_table_size: int = 0, lora_target_modules: List[str] | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.FalconForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config(config)[source]
class tensorrt_llm.models.FalconModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, hidden_states=None)[source]
class tensorrt_llm.models.GPTForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config(config: PretrainedConfig)[source]
use_lora(lora_config: LoraBuildConfig)[source]

Load LoRA weights from the given config into the module.

:param lora_config: the LoRA config

class tensorrt_llm.models.GPTJForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config(config)[source]
class tensorrt_llm.models.GPTJModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None)[source]
class tensorrt_llm.models.GPTModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids, position_ids, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, lora_params=None)[source]
class tensorrt_llm.models.GPTNeoXForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

class tensorrt_llm.models.GPTNeoXModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None)[source]
class tensorrt_llm.models.GemmaForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM, TopModelMixin

check_config(config)[source]
classmethod from_hugging_face(hf_model_dir, dtype='float16', mapping: Mapping | None = None, **kwargs)[source]

Create an LLM object and load weights from a Hugging Face model directory.

:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, specifies the multi-GPU parallel strategy; when None, a single GPU is used
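
For illustration, a minimal usage sketch (the checkpoint path is a placeholder, and the Mapping constructor arguments are assumptions about tensorrt_llm.mapping.Mapping, which this page only references by type):

    from tensorrt_llm.mapping import Mapping
    from tensorrt_llm.models import GemmaForCausalLM

    # Single-GPU load: mapping is left as None, per the parameter description above.
    gemma = GemmaForCausalLM.from_hugging_face("/path/to/gemma-hf", dtype="float16")

    # Two-way tensor parallelism: each rank loads its own shard. The Mapping arguments
    # below (world_size, tp_size, rank) are assumed, not taken from this page.
    gemma_shard = GemmaForCausalLM.from_hugging_face(
        "/path/to/gemma-hf", dtype="float16",
        mapping=Mapping(world_size=2, tp_size=2, rank=0))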

class tensorrt_llm.models.LLaMAForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config(config)[source]
default_plugin_config(**kwargs)[source]

Return the default plugin config for this model, used when no plugin_config value is given in the to_trt() call. Users who need a different plugin config can start from the returned object and modify it.

classmethod from_hugging_face(hf_model_dir, dtype='float16', mapping: Mapping | None = None, **kwargs)[source]

Create an LLM object and load weights from a Hugging Face model directory.

:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, specifies the multi-GPU parallel strategy; when None, a single GPU is used

classmethod from_meta_ckpt(meta_ckpt_dir, dtype, mapping, use_parallel_embedding: bool | None = False, embedding_sharding_dim: int | None = 0)[source]
classmethod quantize(hf_model_dir, output_dir, quant_config: QuantConfig, *, dtype='float16', mapping: Mapping | None = None, calib_batches=512, calib_batch_size=1, random_seed=1234, tokenizer_max_seq_length=2048, **kwargs)[source]
use_lora(lora_config: LoraBuildConfig)[source]

Load LoRA weights from the given config into the module.

:param lora_config: the LoRA config
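
A minimal sketch of the Hugging Face loading path described above, combined with save_checkpoint() inherited from PretrainedModel (paths are placeholders):

    from tensorrt_llm.models import LLaMAForCausalLM

    # Load HF weights on a single GPU (mapping defaults to None) and persist them
    # in the TRT-LLM checkpoint format for later engine builds.
    llama = LLaMAForCausalLM.from_hugging_face("/path/to/llama-hf", dtype="float16")
    llama.save_checkpoint("/path/to/trtllm-ckpt", save_config=True)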

class tensorrt_llm.models.LLaMAModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids, position_ids=None, use_cache=False, attention_mask=None, medusa_position_offsets=None, medusa_packed_mask=None, kv_cache_params=None, attention_params=None, hidden_states=None, prompt_embedding_table: Tensor | None = None, prompt_tasks: Tensor | None = None, prompt_vocab_size: Tensor | None = None, lora_params=None)[source]
class tensorrt_llm.models.MPTForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config(config)[source]
class tensorrt_llm.models.MPTModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids, position_ids, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None)[source]
class tensorrt_llm.models.MambaLMHeadModel(*args, **kwargs)[source]

Bases: PretrainedModel

forward(input_ids, conv_states, ssm_states, host_request_types, host_context_lengths, last_token_ids, slot_mapping: Tensor | None = None)[source]
prepare_inputs(max_batch_size, max_input_len, max_seq_len, use_cache, max_beam_width: int = 1, max_num_tokens: int | None = None, opt_num_tokens: int | None = None, prompt_embedding_table_size: int = 0, max_draft_len: int = 0, gather_context_logits: bool = False, gather_generation_logits: bool = False, lora_target_modules: List[str] | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.MedusaForCausalLm(*args, **kwargs)[source]

Bases: LLaMAForCausalLM

forward(*args, **kwargs)[source]
prepare_inputs(*args, **kwargs)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.OPTForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config(config)[source]
class tensorrt_llm.models.OPTModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None)[source]
class tensorrt_llm.models.PhiForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM, TopModelMixin

check_config(config)[source]
classmethod convert_hf_checkpoint(hf_model_dir: str, dtype: str | None = 'float16', output_dir: str | None = None, **kwargs)[source]

Convert a Hugging Face checkpoint to a TRT-LLM checkpoint.
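
A minimal sketch of the conversion flow (paths are placeholders); loading the result back relies on from_checkpoint() inherited from PretrainedModel:

    from tensorrt_llm.models import PhiForCausalLM

    # Convert the HF checkpoint into a TRT-LLM checkpoint directory, then load it.
    PhiForCausalLM.convert_hf_checkpoint(
        "/path/to/phi-hf", dtype="float16", output_dir="/path/to/trtllm-ckpt")
    phi = PhiForCausalLM.from_checkpoint("/path/to/trtllm-ckpt", rank=0)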

class tensorrt_llm.models.PhiModel(config: PretrainedConfig)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None)[source]
class tensorrt_llm.models.PretrainedConfig(architecture: str, dtype: str, logits_dtype: str, vocab_size: int, max_position_embeddings: int, hidden_size: int, num_hidden_layers: int, num_attention_heads: int, num_key_value_heads: int, hidden_act: str, intermediate_size: int, norm_epsilon: float, position_embedding_type: str, world_size: int, tp_size: int, pp_size: int, quantization: QuantConfig | dict, use_parallel_embedding: bool = False, embedding_sharding_dim: int = 0, share_embedding_table: bool = False, head_size: int | None = None, **kwargs)[source]

Bases: object

classmethod from_dict(config)[source]
classmethod from_json_file(config_file: str)[source]
property quant_mode
set_if_not_exist(key, value)[source]
set_rank(rank)[source]
to_dict()[source]
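
A minimal sketch of reading a checkpoint config (the config.json file name follows the usual TRT-LLM checkpoint layout and is an assumption here; the path is a placeholder):

    from tensorrt_llm.models import PretrainedConfig

    # Load the serialized config, select the shard handled by this rank,
    # and inspect it as a plain dict.
    config = PretrainedConfig.from_json_file("/path/to/trtllm-ckpt/config.json")
    config.set_rank(0)
    print(config.to_dict()["architecture"])
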
class tensorrt_llm.models.PretrainedModel(*args, **kwargs)[source]

Bases: Module, GenerationMixin, TopModelMixin

check_config(config)[source]
classmethod from_checkpoint(ckpt_dir: str, rank: int = 0, config: PretrainedConfig | None = None)[source]
classmethod from_config(config: PretrainedConfig)[source]
load(weights)[source]
load_partial_weights(weights: dict)[source]
prepare_inputs(max_batch_size, max_input_len, max_seq_len, use_cache, max_beam_width: int = 1, max_num_tokens: int | None = None, opt_num_tokens: int | None = None, prompt_embedding_table_size: int = 0, position_encoding_2d: bool = False, max_draft_len: int = 0, gather_context_logits: bool = False, gather_generation_logits: bool = False, lora_target_modules: List[str] | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

classmethod quantize(hf_model_dir, output_dir, quant_config: QuantConfig, *, dtype='float16', mapping: Mapping | None = None, calib_batches=512, calib_batch_size=1, random_seed=1234, tokenizer_max_seq_length=2048)[source]
release()[source]
save_checkpoint(output_dir, save_config=True)[source]
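
A minimal sketch of the prepare_inputs()/forward() flow documented above, shown with a concrete subclass; the tensorrt_llm.Builder and net_guard usage reflects the usual top-level build pattern and is an assumption, not part of this page (the checkpoint path is a placeholder):

    import tensorrt_llm
    from tensorrt_llm.models import LLaMAForCausalLM

    # Restore a model from a TRT-LLM checkpoint.
    model = LLaMAForCausalLM.from_checkpoint("/path/to/trtllm-ckpt", rank=0)

    builder = tensorrt_llm.Builder()
    network = builder.create_network()
    with tensorrt_llm.net_guard(network):
        # Per the docstring above, prepare_inputs() returns a list of values that is
        # fed straight into forward(); the given sizes define the TRT dynamic-shape ranges.
        inputs = model.prepare_inputs(
            max_batch_size=8, max_input_len=1024, max_seq_len=2048,
            use_cache=True, max_beam_width=1)
        model(*inputs)
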
class tensorrt_llm.models.QWenForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config(config)[source]
class tensorrt_llm.models.WhisperEncoder(n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int, dtype)[source]

Bases: Module

forward(x: Tensor, input_lengths=None)[source]
prepare_inputs(max_batch_size=16)[source]
tensorrt_llm.models.quantize_model(model: Module, quant_mode: QuantMode, **kwargs: Any)[source]
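
A minimal sketch of applying quantize_model() to a model built from an existing checkpoint config; QuantMode.use_weight_only() and its import path are assumptions about the quantization module, and the path is a placeholder:

    from tensorrt_llm.models import GPTForCausalLM, PretrainedConfig, quantize_model
    from tensorrt_llm.quantization import QuantMode

    # Build the model skeleton from a checkpoint config, then replace its layers with
    # weight-only quantized variants before loading weights.
    config = PretrainedConfig.from_json_file("/path/to/trtllm-ckpt/config.json")
    gpt = GPTForCausalLM.from_config(config)
    gpt = quantize_model(gpt, QuantMode.use_weight_only(use_int4_weights=False))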