Models

class tensorrt_llm.models.BaichuanForCausalLM(num_layers, num_heads, num_kv_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, position_embedding_type, dtype, logits_dtype='float32', mlp_hidden_size=None, mapping=<tensorrt_llm.mapping.Mapping object>, quant_mode=QuantMode.None)[source]

Bases: BaichuanModel, GenerationMixin

forward(input_ids: Tensor, position_ids=None, use_cache=False, last_token_ids=None, attention_mask=None, kv_cache_params=None, attention_params=None)[source]
prepare_inputs(max_batch_size, max_input_len, max_new_tokens, use_cache, max_beam_width, max_num_tokens: int | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.BertForQuestionAnswering(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, type_vocab_size, num_labels=2, mapping=<tensorrt_llm.mapping.Mapping object>, dtype=None)[source]

Bases: Module

forward(input_ids=None, input_lengths=None, token_type_ids=None, position_ids=None, hidden_states=None)[source]
class tensorrt_llm.models.BertModel(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, type_vocab_size, mapping=<tensorrt_llm.mapping.Mapping object>, dtype=None)[source]

Bases: Module

forward(input_ids=None, input_lengths=None, token_type_ids=None, position_ids=None, hidden_states=None)[source]
class tensorrt_llm.models.BloomForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config()[source]
class tensorrt_llm.models.BloomModel(config)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, attention_params=None)[source]
class tensorrt_llm.models.ChatGLMHeadModel(apply_query_key_layer_scaling: bool | None = None, apply_residual_connection_post_layernorm: bool | None = None, dtype: str | None = None, enable_debug_output: bool | None = None, ffn_hidden_size: int | None = None, hidden_act: str | None = None, hidden_size: int | None = None, linear_bias: bool | None = None, logits_dtype: str | None = None, mapping: Mapping | None = None, max_input_len: int | None = None, max_output_len: int | None = None, max_seq_length: int | None = None, model_name: str | None = None, norm_epsilon: float | None = None, num_heads: int | None = None, num_kv_heads: int | None = None, num_layers: int | None = None, qkv_bias: bool | None = None, quant_mode: QuantMode | None = None, rmsnorm: bool | None = None, rotary_embedding_scaling: float | None = None, tokens_per_block: int | None = None, use_cache: bool | None = None, vocab_size: int | None = None, max_position_embeddings: int | None = None)[source]

Bases: ChatGLMModel, GenerationMixin

forward(input_ids: Tensor | None = None, position_ids: Tensor | None = None, last_token_ids: Tensor | None = None, kv_cache_params: KeyValueCacheParams | None = None, attention_params: AttentionParams | None = None)[source]
prepare_inputs(max_batch_size: int = 0, max_input_len: int = 0, max_new_tokens: int = 0, use_cache: bool = True, max_beam_width: int = 1)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.ChatGLMModel(config)[source]

Bases: Module

forward(input_ids: Tensor | None = None, position_ids: Tensor | None = None, kv_cache_params: KeyValueCacheParams | None = None, attention_params: AttentionParams | None = None)[source]
class tensorrt_llm.models.DecoderModel(num_layers, num_heads, hidden_size, ffn_hidden_size, encoder_num_heads, encoder_hidden_size, vocab_size, dtype, logits_dtype='float32', head_size=None, encoder_head_size=None, num_kv_heads=None, encoder_num_kv_heads=None, max_position_embeddings=None, has_position_embedding=False, relative_attention=False, max_distance=None, num_buckets=None, type_vocab_size=None, has_embedding_layernorm=False, has_embedding_scale=False, q_scaling=1.0, has_attention_qkvo_bias=False, has_mlp_bias=False, has_model_final_layernorm=False, layernorm_eps=1e-05, layernorm_position=LayerNormPositionType.pre_layernorm, layernorm_type=LayerNormType.LayerNorm, hidden_act='relu', mlp_type=MLPType.MLP, rescale_before_lm_head=False, has_lm_head_bias=False, residual_scaling=1.0, use_parallel_embedding=False, embedding_sharding_dim=0, mapping=<tensorrt_llm.mapping.Mapping object>)[source]

Bases: Module, GenerationMixin

forward(decoder_input_ids: Tensor, encoder_output: Tensor, position_ids=None, token_type_ids=None, use_cache=False, attention_mask=None, last_token_ids=None, kv_cache_params=None, attention_params=None, hidden_states=None, all_reduce_workspace=None)[source]
prepare_inputs(max_batch_size, max_beam_width, max_decoder_input_len, max_new_tokens, max_encoder_input_len)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.EncoderModel(num_layers, num_heads, hidden_size, ffn_hidden_size, vocab_size, dtype, head_size=None, num_kv_heads=None, max_position_embeddings=None, has_position_embedding=False, relative_attention=False, max_distance=None, num_buckets=None, type_vocab_size=None, has_embedding_layernorm=False, has_embedding_scale=False, q_scaling=1.0, has_attention_qkvo_bias=False, has_mlp_bias=False, has_model_final_layernorm=False, layernorm_eps=1e-05, layernorm_position=LayerNormPositionType.pre_layernorm, layernorm_type=LayerNormType.LayerNorm, hidden_act='relu', mlp_type=MLPType.MLP, residual_scaling=1.0, use_parallel_embedding=False, embedding_sharding_dim=0, mapping=<tensorrt_llm.mapping.Mapping object>)[source]

Bases: Module, GenerationMixin

forward(input_ids: Tensor, input_lengths=None, position_ids=None, token_type_ids=None, hidden_states=None, all_reduce_workspace=None, max_input_length=None)[source]
prepare_inputs(max_batch_size, max_input_len)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.FalconForCausalLM(num_layers: int, num_heads: int, hidden_size: int, vocab_size: int, max_position_embeddings: int, hidden_act: str = 'gelu', dtype: str | ~tensorrt.tensorrt.DataType | None = None, num_kv_heads: int | None = None, mlp_hidden_size: int | None = None, bias: bool = True, quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = QuantMode.None, use_alibi: bool = True, parallel_attention: bool = False, new_decoder_architecture: bool = False, logits_dtype: str | ~tensorrt.tensorrt.DataType = 'float32', mapping=<tensorrt_llm.mapping.Mapping object>)[source]

Bases: FalconModel, GenerationMixin

forward(input_ids: Tensor, position_ids=None, use_cache=False, last_token_ids=None, attention_mask=None, kv_cache_params=None, attention_params=None, hidden_states=None, all_reduce_workspace=None)[source]
prepare_inputs(max_batch_size: int, max_input_len: int, max_new_tokens: int, use_cache: bool, max_beam_width: int = 1, max_num_tokens: int | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.FalconModel(num_layers: int, num_heads: int, hidden_size: int, vocab_size: int, hidden_act: int, max_position_embeddings: int, dtype: str | ~tensorrt.tensorrt.DataType | None = None, mapping: ~tensorrt_llm.mapping.Mapping = <tensorrt_llm.mapping.Mapping object>, num_kv_heads: int | None = None, mlp_hidden_size: int | None = None, bias: bool = True, quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = QuantMode.None, use_alibi: bool = True, parallel_attention: bool = False, new_decoder_architecture: bool = False)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, hidden_states=None, all_reduce_workspace=None)[source]
class tensorrt_llm.models.GPTJForCausalLM(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, rotary_dim, dtype, logits_dtype='float32', mapping=<tensorrt_llm.mapping.Mapping object>, quant_mode=QuantMode.None)[source]

Bases: GPTJModel, GenerationMixin

forward(input_ids: Tensor, position_ids=None, use_cache=False, last_token_ids=None, kv_cache_params=None, attention_params=None)[source]
prepare_inputs(max_batch_size, max_input_len, max_new_tokens, use_cache, max_beam_width, max_num_tokens: int | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.GPTJModel(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, rotary_dim, dtype=None, mapping=<tensorrt_llm.mapping.Mapping object>, quant_mode=QuantMode.None)[source]

Bases: Module

forward(input_ids: Tensor, use_cache=False, kv_cache_params=None, attention_params=None)[source]
class tensorrt_llm.models.GPTLMHeadModel(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, dtype, logits_dtype='float32', mapping=<tensorrt_llm.mapping.Mapping object>, apply_query_key_layer_scaling=False, position_embedding_type=PositionEmbeddingType.learned_absolute, rotary_embedding_percentage=1.0, rotary_base=10000.0, rotary_scaling=None, inter_size=None, bias=True, quant_mode=QuantMode.None, num_kv_heads=None, use_prompt_tuning=False, use_parallel_embedding=False, embedding_sharding_dim=0, moe_config=MoeConfig(num_experts=0, top_k=0, tp_mode=<ParallelismMode.TENSOR_PARALLEL: 2>, normalization_mode=<ExpertScaleNormalizationMode.RENORMALIZE: 1>), share_embedding_table=False)[source]

Bases: GPTModel, GenerationMixin

forward(input_ids: Tensor, position_ids=None, use_cache=False, last_token_ids=None, attention_mask=None, kv_cache_params=None, attention_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, workspace=None, lora_params=None)[source]
prepare_inputs(max_batch_size, max_input_len, max_new_tokens, use_cache, max_beam_width: int = 1, max_num_tokens: int | None = None, prompt_embedding_table_size: int = 0, gather_all_token_logits: bool = False, max_draft_len: int = 0, lora_target_modules: List[str] | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.GPTModel(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, dtype=None, mapping=<tensorrt_llm.mapping.Mapping object>, apply_query_key_layer_scaling=False, position_embedding_type=PositionEmbeddingType.learned_absolute, rotary_embedding_percentage=1.0, rotary_base=10000.0, rotary_scaling=None, inter_size=None, bias=True, quant_mode=QuantMode.None, num_kv_heads=None, use_prompt_tuning=False, use_parallel_embedding=False, embedding_sharding_dim=0, moe_config=MoeConfig(num_experts=0, top_k=0, tp_mode=<ParallelismMode.TENSOR_PARALLEL: 2>, normalization_mode=<ExpertScaleNormalizationMode.RENORMALIZE: 1>))[source]

Bases: Module

forward(input_ids, position_ids, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, workspace=None, lora_params=None)[source]
class tensorrt_llm.models.GPTNeoXForCausalLM(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, rotary_dim, dtype, position_embedding_type=PositionEmbeddingType.rope_gpt_neox, mapping=<tensorrt_llm.mapping.Mapping object>, apply_query_key_layer_scaling=False, use_parallel_embedding=False, embedding_sharding_dim=0)[source]

Bases: GPTNeoXModel, GenerationMixin

forward(input_ids: Tensor, position_ids=None, use_cache=False, last_token_ids=None, kv_cache_params=None, attention_params=None)[source]
prepare_inputs(max_batch_size, max_input_len, max_new_tokens, use_cache, max_beam_width)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.GPTNeoXModel(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, rotary_dim, dtype=None, position_embedding_type=PositionEmbeddingType.rope_gpt_neox, mapping=<tensorrt_llm.mapping.Mapping object>, apply_query_key_layer_scaling=False, use_parallel_embedding=False, embedding_sharding_dim=0)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, kv_cache_params=None, attention_params=None)[source]
class tensorrt_llm.models.LLaMAForCausalLM(num_layers, num_heads, num_kv_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, dtype, logits_dtype='float32', mlp_hidden_size=None, position_embedding_type=PositionEmbeddingType.rope_gpt_neox, rotary_base=10000.0, rotary_scaling=None, mapping=<tensorrt_llm.mapping.Mapping object>, quant_mode=QuantMode.None, use_parallel_embedding=False, embedding_sharding_dim=0, rms_norm_eps=1e-06, use_fused_mlp=False, attn_bias=False, mlp_bias=False, moe_config=MoeConfig(num_experts=0, top_k=0, tp_mode=<ParallelismMode.TENSOR_PARALLEL: 2>, normalization_mode=<ExpertScaleNormalizationMode.RENORMALIZE: 1>), use_prompt_tuning: bool = False)[source]

Bases: LLaMAModel, GenerationMixin

forward(input_ids, position_ids=None, use_cache=False, last_token_ids=None, attention_mask=None, kv_cache_params=None, attention_params=None, hidden_states=None, all_reduce_workspace=None, prompt_embedding_table: Tensor | None = None, prompt_tasks: Tensor | None = None, prompt_vocab_size: Tensor | None = None, lora_params=None)[source]
prepare_inputs(max_batch_size, max_input_len, max_new_tokens, use_cache, max_beam_width, max_num_tokens: int | None = None, prompt_embedding_table_size: int = 0, gather_all_token_logits: bool = False, lora_target_modules: List[str] | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.LLaMAModel(num_layers, num_heads, num_kv_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, dtype, mlp_hidden_size=None, position_embedding_type=PositionEmbeddingType.rope_gpt_neox, rotary_base=10000.0, rotary_scaling=None, mapping=<tensorrt_llm.mapping.Mapping object>, quant_mode=QuantMode.None, use_parallel_embedding=False, embedding_sharding_dim=0, rms_norm_eps=1e-06, use_fused_mlp=False, attn_bias=False, mlp_bias=False, moe_config: ~tensorrt_llm.layers.moe.MoeConfig = MoeConfig(num_experts=0, top_k=0, tp_mode=<ParallelismMode.TENSOR_PARALLEL: 2>, normalization_mode=<ExpertScaleNormalizationMode.RENORMALIZE: 1>), use_prompt_tuning: bool = False)[source]

Bases: Module

forward(input_ids, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, hidden_states=None, all_reduce_workspace=None, prompt_embedding_table: Tensor | None = None, prompt_tasks: Tensor | None = None, prompt_vocab_size: Tensor | None = None, lora_params=None)[source]
class tensorrt_llm.models.OPTForCausalLM(*args, **kwargs)[source]

Bases: DecoderModelForCausalLM

check_config()[source]
class tensorrt_llm.models.OPTModel(config)[source]

Bases: Module

forward(input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None)[source]
class tensorrt_llm.models.PretrainedConfig(architecture: str, dtype: str, logits_dtype: str, vocab_size: int, max_position_embeddings: int, hidden_size: int, num_hidden_layers: int, num_attention_heads: int, num_key_value_heads: int, hidden_act: str, intermediate_size: int, norm_epsilon: float, position_embedding_type: str, world_size: int, tp_size: int, pp_size: int, quant_mode: QuantMode, quant_kwargs: dict, use_prompt_tuning: bool, **kwargs)[source]

Bases: object

classmethod from_dict(config)[source]
classmethod from_json_file(config_file: str)[source]
set_if_not_exist(key, value)[source]
set_rank(rank)[source]
to_dict()[source]
class tensorrt_llm.models.PretrainedModel(*args, **kwargs)[source]

Bases: Module, GenerationMixin

check_config()[source]
classmethod from_checkpoint(ckpt_dir: str, rank: int = 0)[source]
classmethod from_config(config: PretrainedConfig)[source]
load(weights)[source]
prepare_inputs(max_batch_size, max_input_len, max_new_tokens, use_cache, max_beam_width: int = 1, max_num_tokens: int | None = None, prompt_embedding_table_size: int = 0, gather_all_token_logits: bool = False, lora_target_modules: List[str] | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.QWenForCausalLM(num_layers, num_heads, num_kv_heads, hidden_size, seq_length, vocab_size, hidden_act, max_position_embeddings, dtype, logits_dtype='float32', mlp_hidden_size=None, neox_rotary_style=True, rotary_base=10000.0, rotary_scaling=None, mapping=<tensorrt_llm.mapping.Mapping object>, quant_mode=QuantMode.None, use_parallel_embedding=False, embedding_sharding_dim=0, rms_norm_eps=1e-06)[source]

Bases: QWenModel, GenerationMixin

forward(input_ids, position_ids=None, use_cache=False, last_token_ids=None, kv_cache_params=None, attention_params=None, hidden_states=None, all_reduce_workspace=None)[source]
prepare_inputs(max_batch_size, max_input_len, max_new_tokens, use_cache, max_beam_width: int = 1, max_num_tokens: int | None = None)[source]

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values that can be fed into self.forward()

class tensorrt_llm.models.WhisperEncoder(n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int, dtype)[source]

Bases: Module

forward(x: Tensor)[source]
prepare_inputs(max_batch_size=16)[source]
tensorrt_llm.models.quantize_model(model: Module, quant_mode: QuantMode, **kwargs: Any)[source]