Models#

class tensorrt_llm.models.BertModel(*args, **kwargs)[source]#

Bases: BertBase

forward( input_ids=None, input_lengths=None, position_ids=None, token_type_ids=None, hidden_states=None, max_input_length=None, )[source]#

class tensorrt_llm.models.BertForQuestionAnswering(*args, **kwargs)[source]#

Bases: BertBase

forward( input_ids=None, input_lengths=None, token_type_ids=None, position_ids=None, hidden_states=None, max_input_length=None, )[source]#

class tensorrt_llm.models.BertForSequenceClassification(*args, **kwargs)[source]#

Bases: BertBase

forward( input_ids, input_lengths, token_type_ids=None, position_ids=None, hidden_states=None, max_input_length=None, )[source]#

tensorrt_llm.models.RobertaModel#: alias of BertModel

tensorrt_llm.models.RobertaForQuestionAnswering#: alias of BertForQuestionAnswering

tensorrt_llm.models.RobertaForSequenceClassification#: alias of BertForSequenceClassification

class tensorrt_llm.models.BloomModel( config: PretrainedConfig, )[source]#

Bases: Module

forward( input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, attention_params=None, )[source]#

class tensorrt_llm.models.BloomForCausalLM(*args, **kwargs)[source]#: Bases: DecoderModelForCausalLM

class tensorrt_llm.models.CLIPVisionTransformer( image_size, num_channels, patch_size, hidden_size, num_attention_heads, max_position_embeddings, norm_epsilon, intermediate_size, hidden_act, num_hidden_layers, require_ln_f, mapping: Mapping, dtype, )[source]#

Bases: Module

forward(pixel_values)[source]#

class tensorrt_llm.models.DiT(*args, **kwargs)[source]#

Bases: PretrainedModel

check_config( config: PretrainedConfig, )[source]#

unpatchify(x: Tensor)[source]#

forward(latent, timestep, label)[source]#: Forward pass of DiT. Input shapes: latent: (N, C, H, W); timestep: (N,); label: (N,).

forward_without_cfg(x, t, y)[source]#: Forward pass without classifier-free guidance.

forward_with_cfg(x, t, y)[source]#: Forward pass with classifier-free guidance.

prepare_inputs(max_batch_size, **kwargs)[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

class tensorrt_llm.models.SD3Transformer2DModel(*args, **kwargs)[source]#

Bases: PretrainedModel

config_class#: alias of SD3Transformer2DModelConfig

forward( hidden_states: Tensor, encoder_hidden_states: Tensor | None = None, pooled_projections: Tensor | None = None, timestep: Tensor | None = None, block_controlnet_hidden_states: List[Tensor] = None, joint_attention_kwargs: Dict[str, Any] | None = None, )[source]#

prepare_inputs(max_batch_size, **kwargs)[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

classmethod from_pretrained(

pretrained_model_name_or_path: str,

dtype='float16',

mapping=<tensorrt_llm.mapping.Mapping object>,

**kwargs,

)[source]#

load(weights, from_pruned=False)[source]#

enable_forward_chunking( chunk_size: int | None = None, dim: int = 0, )[source]#

disable_forward_chunking()[source]#

property attn_processors#

set_attn_processor(processor)[source]#

fuse_qkv_projections()[source]#

unfuse_qkv_projections()[source]#

class tensorrt_llm.models.DeepseekForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of DeepSeekV1Config

classmethod from_hugging_face(

model_dir,

dtype: str = 'auto',

mapping: Mapping | None = None,

override_fields={},

**kwargs,

)[source]#

Create LLM object and load weights from hugging face.

Parameters:

model_dir – the hugging face model directory
dtype – str, the default weights data type when loading from the hugging face model
mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

class tensorrt_llm.models.FalconConfig(

*,

bias: bool = False,

parallel_attention: bool = False,

num_ln_in_parallel_attn: int | None = None,

new_decoder_architecture: bool = False,

rotary_base: float = 10000.0,

**kwargs,

)[source]#

Bases: PretrainedConfig

to_dict()[source]#

classmethod from_hugging_face(

hf_config_or_dir: str | transformers.PretrainedConfig,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

class tensorrt_llm.models.DeepseekV2ForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of DeepSeekV2Config

classmethod from_hugging_face(

model_dir,

dtype: str = 'auto',

hf_model: PreTrainedModel | None = None,

use_preloading: bool = False,

use_safetensors_loading: bool = False,

mapping: Mapping | None = None,

override_fields={},

**kwargs,

)[source]#

Create LLM object and load weights from hugging face.

Parameters:

model_dir – the hugging face model directory
dtype – str, the default weights data type when loading from the hugging face model
mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

class tensorrt_llm.models.FalconForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of FalconConfig

check_config(config)[source]#

classmethod from_hugging_face(

hf_model_or_dir: str | transformers.PreTrainedModel,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#: Create a FalconForCausalLM object from the given parameters

class tensorrt_llm.models.FalconModel(config: FalconConfig)[source]#

Bases: Module

forward( input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, hidden_states=None, )[source]#

class tensorrt_llm.models.GPTConfig(

*,

gpt_variant: str = 'gpt2',

bias: bool = True,

q_scaling: float = 1.0,

embedding_scale: float | None = None,

apply_query_key_layer_scaling: bool = False,

rotary_pct: float = 1.0,

rotary_base: float = 10000.0,

rotary_scaling: dict | None = None,

inner_layernorm: bool = False,

norm_before_bmm1: bool = False,

moe: MoeConfig | dict | None = None,

**kwargs,

)[source]#

Bases: PretrainedConfig

to_dict()[source]#

classmethod from_hugging_face(

hf_config_or_dir: str | transformers.PretrainedConfig,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

classmethod from_nemo(

nemo_ckpt_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

class tensorrt_llm.models.GPTModel(config: GPTConfig)[source]#

Bases: Module

forward( input_ids, position_ids, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, hidden_states=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, lora_params=None, spec_decoding_params=None, )[source]#

class tensorrt_llm.models.GPTForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of GPTConfig

classmethod from_hugging_face(

hf_model_or_dir: str | transformers.PreTrainedModel,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#: Create a GPTForCausalLM object from the given parameters

classmethod quantize(

hf_model_dir: str,

output_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

*,

device: str = 'cuda',

calib_dataset: str = 'cnn_dailymail',

calib_batches: int = 512,

calib_batch_size: int = 1,

calib_max_seq_length: int = 512,

random_seed: int = 1234,

tokenizer_max_seq_length: int = 2048,

**kwargs,

)[source]#

classmethod from_nemo(

nemo_ckpt_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

use_lora( lora_config: LoraConfig, )[source]#

Load LoRA weights from the given config into the module.

Parameters: lora_config – the LoRA config

class tensorrt_llm.models.OPTForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

check_config(config)[source]#

class tensorrt_llm.models.OPTModel(config: PretrainedConfig)[source]#

Bases: Module

forward(

input_ids: Tensor,

position_ids=None,

use_cache=False,

attention_mask=None,

kv_cache_params=None,

attention_params=None,

prompt_embedding_table=None,

prompt_tasks=None,

prompt_vocab_size=None,

**kwargs,

)[source]#

class tensorrt_llm.models.LLaMAConfig(

*,

mlp_bias: bool = False,

attn_bias: bool = False,

rotary_base: float = 10000.0,

rotary_scaling: dict | None = None,

residual_mlp: bool = False,

disable_weight_only_quant_plugin: bool = False,

moe: MoeConfig | dict | None = None,

remove_duplicated_kv_heads: bool = False,

embedding_multiplier: float = 1.0,

attention_multiplier: float = 1.0,

residual_multiplier: float = 1.0,

output_multiplier_scale: float = 1.0,

**kwargs,

)[source]#

Bases: PretrainedConfig

to_dict()[source]#

classmethod from_hugging_face(

hf_config_or_dir: str | transformers.PretrainedConfig,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

classmethod from_meta_ckpt(

meta_ckpt_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

class tensorrt_llm.models.LLaMAForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of LLaMAConfig

classmethod from_hugging_face(

hf_model_or_dir: str | PreTrainedModel,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#: Create a LLaMAForCausalLM object from the given parameters

default_plugin_config(**kwargs)[source]#

Return the default plugin config for this model.

This is used when the plugin_config value is not given in to_trt() call. If users need to set different plugin configs, they can start from the return object and change it.

classmethod from_meta_ckpt(

meta_ckpt_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

classmethod quantize(

hf_model_dir: str,

output_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

*,

device: str = 'cuda',

calib_dataset: str = 'cnn_dailymail',

calib_batches: int = 512,

calib_batch_size: int = 1,

calib_max_seq_length: int = 512,

random_seed: int = 1234,

tokenizer_max_seq_length: int = 2048,

**kwargs,

)[source]#

use_lora( lora_config: LoraConfig, )[source]#

Load LoRA weights from the given config into the module.

Parameters: lora_config – the LoRA config

class tensorrt_llm.models.LLaMAModel(config: LLaMAConfig)[source]#

Bases: Module

forward( input_ids, position_ids=None, use_cache=False, attention_mask=None, spec_decoding_params=None, kv_cache_params=None, attention_params=None, hidden_states=None, hidden_states_for_embed=None, prompt_embedding_table: Tensor | None = None, prompt_tasks: Tensor | None = None, prompt_vocab_size: Tensor | None = None, lora_params=None, )[source]#

class tensorrt_llm.models.LlavaNextVisionWrapper(*args, **kwargs)[source]#

Bases: PretrainedModel

forward(pixel_values, position_ids=None)[source]#

classmethod from_hugging_face(

hf_model_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#: Create a LlavaNextVisionWrapper object from the given parameters

save_checkpoint(output_dir, save_config=True)[source]#

prepare_inputs(max_batch_size, **kwargs)[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

class tensorrt_llm.models.LlavaNextVisionConfig(

*,

image_size: int,

patch_size: int,

text_hidden_size: int,

projector_hidden_act: str = 'gelu',

num_channels: int = 3,

vision_model_type: str = 'clip_vision_model',

**kwargs,

)[source]#

Bases: PretrainedConfig

classmethod from_hugging_face(

hf_config_or_dir: str | transformers.PretrainedConfig,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

class tensorrt_llm.models.MedusaConfig(

*,

num_medusa_heads: int = 4,

num_medusa_layers: int = 1,

max_draft_len: int = 63,

**kwargs,

)[source]#

Bases: PretrainedConfig

to_dict()[source]#

classmethod from_hugging_face(

hf_config_or_dir: str | transformers.PretrainedConfig,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

class tensorrt_llm.models.MedusaForCausalLm(*args, **kwargs)[source]#

Bases: PretrainedModel

config_class#: alias of MedusaConfig

classmethod from_hugging_face(

hf_model_or_dir: str | transformers.PreTrainedModel,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

Create LLM object and load weights from hugging face.

Parameters:

hf_model_or_dir – the hugging face model, or the hugging face model directory
dtype – str, the default weights data type when loading from the hugging face model
mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

class tensorrt_llm.models.ReDrafterForLLaMALM(*args, **kwargs)[source]#

Bases: ReDrafterMixin, LLaMAForCausalLM

ReDrafter implementation for LLaMA models.

Combines:

- Base LLaMA model functionality from LLaMAForCausalLM
- Drafting/speculative decoding logic from ReDrafterMixin

class tensorrt_llm.models.ReDrafterForQWenLM(*args, **kwargs)[source]#

Bases: ReDrafterMixin, QWenForCausalLM

ReDrafter implementation for QWen models.

Combines:

- Base QWen model functionality from QWenForCausalLM
- Drafting/speculative decoding logic from ReDrafterMixin

class tensorrt_llm.models.GPTJConfig(*, rotary_dim: int = 64, **kwargs)[source]#

Bases: PretrainedConfig

This is the configuration class to store the configuration of GPTJ model.

to_dict()[source]#

classmethod from_hugging_face(

hf_config_or_dir: str | transformers.PretrainedConfig,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

class tensorrt_llm.models.GPTJModel(config: GPTJConfig)[source]#

Bases: Module

forward( input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, )[source]#

class tensorrt_llm.models.GPTJForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of GPTJConfig

classmethod from_hugging_face(

hf_model_or_dir: str | transformers.PreTrainedModel,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config=None,

**kwargs,

)[source]#

Create LLM object and load weights from hugging face.

Parameters:

hf_model_or_dir – the hugging face model, or the hugging face model directory
dtype – str, the default weights data type when loading from the hugging face model
mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

class tensorrt_llm.models.GPTNeoXModel( config: PretrainedConfig, )[source]#

Bases: Module

forward( input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, )[source]#

class tensorrt_llm.models.GPTNeoXForCausalLM(*args, **kwargs)[source]#: Bases: DecoderModelForCausalLM

class tensorrt_llm.models.PhiModel(config: PretrainedConfig)[source]#

Bases: Module

forward( input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, lora_params=None, )[source]#

class tensorrt_llm.models.Phi3Model( config: PretrainedConfig, )[source]#

Bases: Module

forward( input_ids: Tensor, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, lora_params=None, )[source]#

class tensorrt_llm.models.PhiForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of PhiConfig

check_config(config)[source]#

Create LLM object and load weights from hugging face.

Parameters:

hf_model_dir – the hugging face model directory
dtype – str, the default weights data type when loading from the hugging face model
mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

use_lora( lora_config: LoraConfig, )[source]#

Load LoRA weights from the given config into the module.

Parameters: lora_config – the LoRA config

class tensorrt_llm.models.Phi3ForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of Phi3Config

Create LLM object and load weights from hugging face.

Parameters:

hf_model_dir – the hugging face model directory
dtype – str, the default weights data type when loading from the hugging face model
mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

use_lora( lora_config: LoraConfig, )[source]#

Load LoRA weights from the given config into the module.

Parameters: lora_config – the LoRA config

class tensorrt_llm.models.ChatGLMConfig(

*,

chatglm_version: str = 'chatglm3',

add_bias_linear: bool = False,

add_qkv_bias: bool = True,

apply_query_key_layer_scaling: bool = False,

apply_residual_connection_post_layernorm: bool = False,

rmsnorm: bool = True,

rotary_pct: float = 0.5,

rotary_base: float = 10000.0,

rotary_scaling: dict | None = None,

**kwargs,

)[source]#

Bases: PretrainedConfig

to_dict()[source]#

classmethod from_hugging_face(

hf_config_or_dir: str | transformers.PretrainedConfig,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#

class tensorrt_llm.models.ChatGLMForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of ChatGLMConfig

classmethod from_hugging_face(

hf_model_or_dir: str | transformers.PreTrainedModel,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#: Create a ChatGLMForCausalLM object from the given parameters

classmethod quantize(

hf_model_dir: str,

output_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

*,

device: str = 'cuda',

calib_dataset: str = 'cnn_dailymail',

calib_batches: int = 512,

calib_batch_size: int = 1,

calib_max_seq_length: int = 512,

random_seed: int = 1234,

tokenizer_max_seq_length: int = 2048,

**kwargs,

)[source]#

prepare_inputs(*args, **kwargs)[source]#: See PretrainedModel.prepare_inputs for the detailed parameter list.

class tensorrt_llm.models.ChatGLMModel( config: ChatGLMConfig, )[source]#

Bases: Module

forward( input_ids: Tensor = None, position_ids: Tensor = None, use_cache: bool = False, attention_mask: Tensor = None, kv_cache_params: KeyValueCacheParams = None, attention_params: AttentionParams = None, )[source]#

class tensorrt_llm.models.BaichuanForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of BaichuanConfig

classmethod from_hugging_face(

hf_model_or_dir: str | transformers.PreTrainedModel,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#: Create a BaichuanForCausalLM object from the given parameters

classmethod quantize(

hf_model_dir: str,

output_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

*,

device: str = 'cuda',

calib_dataset: str = 'cnn_dailymail',

calib_batches: int = 512,

calib_batch_size: int = 1,

calib_max_seq_length: int = 512,

random_seed: int = 1234,

tokenizer_max_seq_length: int = 2048,

**kwargs,

)[source]#

class tensorrt_llm.models.EncoderModel(*args, **kwargs)[source]#

Bases: PretrainedModel

check_config( config: PretrainedConfig, )[source]#

forward( input_ids: Tensor, input_lengths=None, position_ids=None, token_type_ids=None, hidden_states=None, max_input_length=None, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, attention_mask=None, lora_params: LoraParams = None, language_adapter_routings: Tensor | None = None, )[source]#

prepare_inputs(

max_batch_size,

max_input_len,

prompt_embedding_table_size: int = 0,

lora_target_modules: List[str] = None,

*args,

**kwargs,

)[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

use_lora( lora_config: LoraConfig, )[source]#

Load LoRA weights from the given config into the module.

Parameters: lora_config – the LoRA config

use_prompt_tuning()[source]#: Enable p-tuning when building the TRT engine; call this before to_trt.

precompute_relative_attention_bias(build_config)[source]#

class tensorrt_llm.models.DecoderModel(*args, **kwargs)[source]#

Bases: PretrainedModel

check_config( config: PretrainedConfig, )[source]#

forward( decoder_input_ids: Tensor, encoder_output: Tensor, position_ids=None, token_type_ids=None, use_cache=False, attention_mask_params=None, last_token_ids=None, kv_cache_params=None, attention_params=None, hidden_states=None, lora_params: LoraParams = None, cross_kv_cache_gen: Tensor | None = None, cross_kv_reuse: Tensor | None = None, language_adapter_routings: Tensor | None = None, )[source]#

prepare_inputs(

max_batch_size,

max_beam_width,

max_decoder_input_len,

max_seq_len,

max_encoder_input_len,

gather_context_logits: bool = False,

lora_target_modules: List[str] = None,

use_cache=True,

*args,

**kwargs,

)[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

use_lora( lora_config: LoraConfig, )[source]#

Load LoRA weights from the given config into the module.

Parameters: lora_config – the LoRA config

precompute_relative_attention_bias(build_config)[source]#

class tensorrt_llm.models.PretrainedConfig(

*,

architecture: str,

dtype: str,

hidden_size: int,

num_hidden_layers: int,

num_attention_heads: int,

vocab_size: int | None = None,

hidden_act: str = 'gelu',

logits_dtype: str = 'float32',

norm_epsilon: float = 1e-05,

position_embedding_type: PositionEmbeddingType | str = PositionEmbeddingType.learned_absolute,

max_position_embeddings: int | None = None,

rotary_embedding_dim: int | None = None,

num_key_value_heads: int | None = None,

intermediate_size: int | None = None,

mapping: Mapping | dict | None = None,

quantization: QuantConfig | dict | None = None,

use_parallel_embedding: bool = False,

embedding_sharding_dim: int = 0,

head_size: int | None = None,

qk_layernorm: bool = False,

runtime_defaults: RuntimeDefaultsIn = None,

**kwargs,

)[source]#

Bases: object

static create_runtime_defaults( defaults: RuntimeDefaultsIn = None, ) → RuntimeDefaults | None[source]#

property kv_dtype#

set_if_not_exist(key, value)[source]#

classmethod from_dict(config: dict)[source]#

to_dict()[source]#

classmethod from_json_file(config_file: str)[source]#

classmethod from_checkpoint(ckpt_dir: str)[source]#

to_json_file(config_file: str)[source]#

to_layer_quant_config(config_file: str)[source]#

property quant_mode#

property quant_algo#

set_rank(rank: int)[source]#

get_config_group(group_cls: Type[CG]) → CG[source]#

has_config_group(group_cls: Type[CG]) → bool[source]#

for_each_rank() → Generator[Self, None, None][source]#

class tensorrt_llm.models.PretrainedModel(*args, **kwargs)[source]#

Bases: Module, GenerationMixin, TopModelMixin

release()[source]#

check_config(config)[source]#

classmethod from_config( config: PretrainedConfig, )[source]#

classmethod from_checkpoint( ckpt_dir: str, rank: int | None = None, config: PretrainedConfig | None = None, *, preprocess_weights_hook: Callable[[Dict[str, Tensor]], Dict[str, Tensor]] | None = None, )[source]#

load(weights, from_pruned=False)[source]#

save_checkpoint(output_dir, save_config=True)[source]#

prepare_inputs( max_batch_size, max_input_len, max_seq_len, max_num_tokens, use_cache, max_beam_width: int = 1, opt_num_tokens: int = None, prompt_embedding_table_size: int = 0, position_encoding_2d: bool = False, max_draft_len: int = 0, speculative_decoding_draft_tokens_external: bool = False, spec_decoding_is_generation_length_variable: bool = False, gather_context_logits: bool = False, lora_target_modules: List[str] = None, opt_batch_size: int = 0, num_hidden_layers: int = None, mrope_rotary_cos_sin_size: int = None, )[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

classmethod quantize(

hf_model_dir: str,

output_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

*,

device: str = 'cuda',

calib_dataset: str = 'cnn_dailymail',

calib_batches: int = 512,

calib_batch_size: int = 1,

calib_max_seq_length: int = 512,

random_seed: int = 1234,

tokenizer_max_seq_length: int = 2048,

**kwargs,

)[source]#

class tensorrt_llm.models.WhisperEncoder(*args, **kwargs)[source]#

Bases: PretrainedModel

forward( input_features: Tensor, input_lengths=None, position_ids=None, )[source]#

prepare_inputs(max_batch_size=16)[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

precompute_relative_attention_bias(build_config)[source]#

class tensorrt_llm.models.MambaForCausalLM(*args, **kwargs)[source]#

Bases: PretrainedModel

config_class#: alias of MambaConfig

forward( input_ids, conv_states, ssm_states, host_request_types, last_token_ids, last_token_ids_for_logits, host_context_lengths, slot_mapping: Tensor | None = None, )[source]#

prepare_inputs( max_batch_size, max_input_len, max_seq_len, max_num_tokens, use_cache, max_beam_width: int = 1, opt_num_tokens: int = None, opt_batch_size: int = 0, prompt_embedding_table_size: int = 0, max_draft_len: int = 0, gather_context_logits: bool = False, lora_target_modules: List[str] = None, speculative_decoding_draft_tokens_external: bool = False, )[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

Create LLM object and load weights from hugging face.

Parameters:

hf_model_dir – the hugging face model directory
dtype – str, the default weights data type when loading from the hugging face model
mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

class tensorrt_llm.models.MPTForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

check_config(config)[source]#

class tensorrt_llm.models.MPTModel(config: PretrainedConfig)[source]#

Bases: Module

forward( input_ids, position_ids, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, )[source]#

class tensorrt_llm.models.GemmaConfig(

*,

architecture: str,

rotary_base: float = 10000.0,

rotary_scaling: dict | None = None,

attn_bias: bool = False,

mlp_bias: bool = False,

position_embedding_type: PositionEmbeddingType = PositionEmbeddingType.rope_gpt_neox,

query_pre_attn_scalar: int | None = None,

final_logit_softcapping: float | None = None,

attn_logit_softcapping: float | None = None,

mapping: Mapping | dict | None = None,

_sliding_window_pattern: int = None,

rope_local_base_freq: int = None,

sliding_window: int = None,

**kwargs,

)[source]#

Bases: PretrainedConfig

GEMMA_ADDED_FIELDS = {'attn_bias', 'inter_layernorms', 'mlp_bias', 'rotary_base', 'rotary_scaling'}#

GEMMA2_ADDED_FIELDS = {'attn_logit_softcapping', 'final_logit_softcapping', 'query_pre_attn_scalar'}#

GEMMA3_ADDED_FIELDS = {'_sliding_window_pattern', 'final_logit_softcapping', 'query_pre_attn_scalar', 'rope_local_base_freq', 'sliding_window'}#

VERBATIM = {'_sliding_window_pattern', 'attn_logit_softcapping', 'final_logit_softcapping', 'hidden_act', 'hidden_size', 'intermediate_size', 'max_position_embeddings', 'num_attention_heads', 'num_hidden_layers', 'query_pre_attn_scalar', 'rope_local_base_freq', 'sliding_window', 'use_parallel_embedding', 'vocab_size'}#

property is_gemma_2: bool#

gemma2_config()[source]#

property is_gemma_3: bool#

gemma3_config()[source]#

to_dict()[source]#: Serialize the fields added in GemmaConfig

static get_hf_config(config_dir: str | PathLike)[source]#

classmethod from_hugging_face(

hf_config_or_dir: HfConfigOrDir,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

) → GemmaConfig[source]#

class tensorrt_llm.models.GemmaForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of GemmaConfig

classmethod from_hugging_face(

hf_model_dir: HfConfigOrDir,

dtype='float16',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

load_model_on_cpu: bool = True,

**kwargs,

)[source]#

Create LLM object and load weights from hugging face.

Parameters:

hf_model_dir – the hugging face model directory
dtype – str, the default weights data type when loading from the hugging face model
mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

NATIVE_QUANT_FLOW = {QuantAlgo.W4A16, QuantAlgo.W8A16, QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN, QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN, QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN, QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN}#

classmethod assert_valid_quant_algo( quant_algo: QuantAlgo | None, )[source]#

classmethod quantize(

hf_model_dir: str,

output_dir: str,

dtype: str = 'float16',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

*,

gemma_config_kwargs: Dict[str, Any] = None,

**quantize_kwargs: Dict[str, Any],

)[source]#

use_lora( lora_config: LoraConfig, ) → None[source]#

Load LoRA weights from the given config into the module.

Parameters: lora_config – the LoRA config

class tensorrt_llm.models.DbrxConfig(

*,

bias: bool = False,

clip_qkv: float | None = None,

rotary_base: float = 500000.0,

rotary_scaling: dict | None = None,

moe: MoeConfig | dict | None = None,

**kwargs,

)[source]#

Bases: PretrainedConfig

to_dict()[source]#

class tensorrt_llm.models.DbrxForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of DbrxConfig

class tensorrt_llm.models.RecurrentGemmaForCausalLM(*args, **kwargs)[source]#

Bases: PretrainedModel

forward( input_ids, position_ids=None, use_cache=False, attention_mask=None, kv_cache_params=None, attention_params=None, conv_states=None, rnn_states=None, host_request_types=None, last_token_ids=None, last_token_ids_for_logits=None, host_context_lengths=None, slot_mapping=None, )[source]#

prepare_recurrent_inputs( max_batch_size, num_profiles, mapping, )[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

class tensorrt_llm.models.CogVLMConfig(

*,

mlp_bias: bool = False,

attn_bias: bool = False,

rotary_base: float = 10000.0,

rotary_scaling: dict | None = None,

**kwargs,

)[source]#

Bases: PretrainedConfig

to_dict()[source]#

class tensorrt_llm.models.CogVLMForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM, TopModelMixin

config_class#: alias of CogVLMConfig

classmethod from_hugging_face(

hf_model_dir,

dtype='float16',

mapping: Mapping | None = None,

quant_mode: QuantMode | None = None,

**kwargs,

)[source]#

Create LLM object and load weights from hugging face.

Parameters:

hf_model_dir – the hugging face model directory
dtype – str, the default weights data type when loading from the hugging face model
mapping – Mapping, specifies the multi-GPU parallel strategy; when it’s None, a single GPU is used

default_plugin_config(**kwargs)[source]#

Return the default plugin config for this model.

This is used when the plugin_config value is not given in the to_trt() call. If users need to set different plugin configs, they can start from the returned object and change it.

classmethod quantize(

hf_model_dir,

output_dir,

quant_config: QuantConfig,

*,

dtype='float16',

mapping: Mapping | None = None,

calib_batches=512,

calib_batch_size=1,

random_seed=1234,

tokenizer_max_seq_length=2048,

**kwargs,

)[source]#

class tensorrt_llm.models.EagleForCausalLM(*args, **kwargs)[source]#

Bases: LLaMAForCausalLM

config_class#: alias of EagleConfig

forward(*args, **kwargs)[source]#

prepare_inputs(*args, **kwargs)[source]#

Inputs needed:

device_request_types: [bs]
draft_tokens: [bs, max_draft_len]
draft_lens: [bs]
spec_decoding_generation_lengths: [bs]
spec_decoding_position_offsets: [bs, max_gen_tokens]
spec_decoding_packed_mask: [bs, max_draft_len, packed_length] **
eagle_temperature: [bs]
rand_data_validation: [bs, max_draft_len]

** The mask is tricky since the boolean mask will need to be packed at runtime. So, the last dim will be: packed_length = ceil((max_draft_len+1)/32)

classmethod from_hugging_face(

hf_model_or_dir: str | transformers.PreTrainedModel,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#: Create a LLaMAForCausalLM object from given parameters

class tensorrt_llm.models.SpeculativeDecodingMode( value, names=<not given>, *values, module=None, qualname=None, type=None, start=1, boundary=None, )[source]#

Bases: IntFlag

NONE = 1#

DRAFT_TOKENS_EXTERNAL = 2#

MEDUSA = 4#

LOOKAHEAD_DECODING = 8#

EXPLICIT_DRAFT_TOKENS = 16#

EAGLE = 32#

NGRAM = 64#

USER_PROVIDED = 128#

AUTO = 256#

static from_arguments(args: Namespace)[source]#

class tensorrt_llm.models.CohereForCausalLM(*args, **kwargs)[source]#

Bases: DecoderModelForCausalLM

config_class#: alias of CohereConfig

classmethod from_hugging_face(

hf_model_or_dir: str,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#: Create a CohereForCausalLM object from given parameters

class tensorrt_llm.models.MLLaMAForCausalLM(*args, **kwargs)[source]#

Bases: PretrainedModel

config_class#: alias of MLLaMAConfig

forward( decoder_input_ids: Tensor, encoder_output: Tensor, use_cache=False, attention_mask_params=None, last_token_ids=None, kv_cache_params=None, attention_params=None, hidden_states=None, lora_params: LoraParams = None, cross_kv_cache_gen: Tensor | None = None, cross_kv_reuse: Tensor | None = None, prompt_embedding_table: Tensor | None = None, prompt_tasks: Tensor | None = None, prompt_vocab_size: Tensor | None = None, skip_cross_attn_blocks: Tensor | None = None, )[source]#

prepare_inputs(

max_batch_size,

max_beam_width,

max_decoder_input_len,

max_seq_len,

max_encoder_input_len,

gather_context_logits: bool = False,

gather_generation_logits: bool = False,

lora_target_modules: List[str] = None,

prompt_embedding_table_size: int = 0,

use_cache=True,

*args,

**kwargs,

)[source]#

@brief: Prepare input Tensors for the model. The given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

use_lora( lora_config: LoraConfig, )[source]#

Load lora weights from the given config to the module.

Parameters:: lora_config – the lora config

classmethod from_hugging_face(

hf_model_or_dir: str | transformers.PreTrainedModel,

dtype: str = 'auto',

mapping: Mapping | None = None,

quant_config: QuantConfig | None = None,

**kwargs,

)[source]#: Create a MLLaMAForCausalLM object from given parameters