Models#
- class tensorrt_llm.models.BaichuanForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - BaichuanConfig
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create a BaichuanForCausalLM object from the given parameters.
 - classmethod quantize(
- hf_model_dir: str,
- output_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- *,
- device: str = 'cuda',
- calib_dataset: str = 'cnn_dailymail',
- calib_batches: int = 512,
- calib_batch_size: int = 1,
- calib_max_seq_length: int = 512,
- random_seed: int = 1234,
- tokenizer_max_seq_length: int = 2048,
- **kwargs,
 
- class tensorrt_llm.models.BloomModel(
- config: PretrainedConfig,
- Bases: - Module
- class tensorrt_llm.models.CLIPVisionTransformer(
- image_size,
- num_channels,
- patch_size,
- hidden_size,
- num_attention_heads,
- max_position_embeddings,
- norm_epsilon,
- intermediate_size,
- hidden_act,
- num_hidden_layers,
- require_ln_f,
- mapping: Mapping,
- dtype,
- Bases: - Module
- class tensorrt_llm.models.ChatGLMConfig(
- *,
- chatglm_version: str = 'chatglm3',
- add_bias_linear: bool = False,
- add_qkv_bias: bool = True,
- apply_query_key_layer_scaling: bool = False,
- apply_residual_connection_post_layernorm: bool = False,
- rmsnorm: bool = True,
- rotary_pct: float = 0.5,
- rotary_base: float = 10000.0,
- rotary_scaling: dict | None = None,
- **kwargs,
- Bases: - PretrainedConfig- classmethod from_hugging_face(
- hf_config_or_dir: str | transformers.PretrainedConfig,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 
- class tensorrt_llm.models.ChatGLMForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - ChatGLMConfig
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create a ChatGLMForCausalLM object from the given parameters.
 - prepare_inputs(*args, **kwargs)[source]#
- See PretrainedModel.prepare_inputs for the detailed parameter list. 
 - classmethod quantize(
- hf_model_dir: str,
- output_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- *,
- device: str = 'cuda',
- calib_dataset: str = 'cnn_dailymail',
- calib_batches: int = 512,
- calib_batch_size: int = 1,
- calib_max_seq_length: int = 512,
- random_seed: int = 1234,
- tokenizer_max_seq_length: int = 2048,
- **kwargs,
 
- class tensorrt_llm.models.ChatGLMModel(
- config: ChatGLMConfig,
- Bases: - Module- forward(
- input_ids: Tensor = None,
- position_ids: Tensor = None,
- use_cache: bool = False,
- attention_mask: Tensor = None,
- kv_cache_params: KeyValueCacheParams = None,
- attention_params: AttentionParams = None,
 
- class tensorrt_llm.models.CogVLMConfig(
- *,
- mlp_bias: bool = False,
- attn_bias: bool = False,
- rotary_base: float = 10000.0,
- rotary_scaling: dict | None = None,
- **kwargs,
- Bases: - PretrainedConfig
- class tensorrt_llm.models.CogVLMForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM,- TopModelMixin- config_class#
- alias of - CogVLMConfig
 - default_plugin_config(**kwargs)[source]#
- Return the default plugin config for this model, used when the plugin_config value is not given in the to_trt() call. If users need to set different plugin configs, they can start from the returned object and change it.
 - classmethod from_hugging_face(
- hf_model_dir,
- dtype='float16',
- mapping: Mapping | None = None,
- quant_mode: QuantMode | None = None,
- **kwargs,
- Create an LLM object and load weights from Hugging Face. Parameters: hf_model_dir – the Hugging Face model directory; dtype – str, the default weights data type when loading from the Hugging Face model; mapping – Mapping, specifies the multi-GPU parallel strategy (when it is None, a single GPU is used).
 - classmethod quantize(
- hf_model_dir,
- output_dir,
- quant_config: QuantConfig,
- *,
- dtype='float16',
- mapping: Mapping | None = None,
- calib_batches=512,
- calib_batch_size=1,
- random_seed=1234,
- tokenizer_max_seq_length=2048,
- **kwargs,
 
- class tensorrt_llm.models.CohereForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - CohereConfig
 - classmethod from_hugging_face(
- hf_model_or_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create a CohereForCausalLM object from the given parameters.
 
- class tensorrt_llm.models.DbrxConfig(
- *,
- bias: bool = False,
- clip_qkv: float | None = None,
- rotary_base: float = 500000.0,
- rotary_scaling: dict | None = None,
- moe: MoeConfig | dict | None = None,
- **kwargs,
- Bases: - PretrainedConfig
- class tensorrt_llm.models.DbrxForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - DbrxConfig
 
- class tensorrt_llm.models.DecoderModel(*args, **kwargs)[source]#
- Bases: - PretrainedModel- check_config(
- config: PretrainedConfig,
 - forward(
- decoder_input_ids: Tensor,
- encoder_output: Tensor,
- position_ids=None,
- token_type_ids=None,
- use_cache=False,
- attention_mask_params=None,
- last_token_ids=None,
- kv_cache_params=None,
- attention_params=None,
- hidden_states=None,
- lora_params: LoraParams = None,
- cross_kv_cache_gen: Tensor | None = None,
- cross_kv_reuse: Tensor | None = None,
- language_adapter_routings: Tensor | None = None,
 - prepare_inputs(
- max_batch_size,
- max_beam_width,
- max_decoder_input_len,
- max_seq_len,
- max_encoder_input_len,
- gather_context_logits: bool = False,
- lora_target_modules: List[str] = None,
- use_cache=True,
- *args,
- **kwargs,
- @brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes. - @return: a list containing values which can be fed into self.forward()
 
- class tensorrt_llm.models.DeepseekForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - DeepSeekV1Config
 - classmethod from_hugging_face(
- model_dir,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- override_fields={},
- **kwargs,
- Create an LLM object and load weights from Hugging Face. Parameters: model_dir – the Hugging Face model directory; dtype – str, the default weights data type when loading from the Hugging Face model; mapping – Mapping, specifies the multi-GPU parallel strategy (when it is None, a single GPU is used).
 
- class tensorrt_llm.models.DeepseekV2ForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - DeepSeekV2Config
 - classmethod from_hugging_face(
- model_dir,
- dtype: str = 'auto',
- hf_model: PreTrainedModel | None = None,
- use_preloading: bool = False,
- use_safetensors_loading: bool = False,
- mapping: Mapping | None = None,
- override_fields={},
- **kwargs,
- Create an LLM object and load weights from Hugging Face. Parameters: model_dir – the Hugging Face model directory; dtype – str, the default weights data type when loading from the Hugging Face model; mapping – Mapping, specifies the multi-GPU parallel strategy (when it is None, a single GPU is used).
 
- class tensorrt_llm.models.DiT(*args, **kwargs)[source]#
- Bases: - PretrainedModel- check_config(
- config: PretrainedConfig,
 - forward(latent, timestep, label)[source]#
- Forward pass of DiT. Input shapes: latent: (N, C, H, W); timestep: (N,); label: (N,).
 
- class tensorrt_llm.models.EagleForCausalLM(*args, **kwargs)[source]#
- Bases: - LLaMAForCausalLM- config_class#
- alias of - EagleConfig
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create a LLaMAForCausalLM object from the given parameters.
 - prepare_inputs(*args, **kwargs)[source]#
- Inputs needed:
- device_request_types: [bs]; draft_tokens: [bs, max_draft_len]; draft_lens: [bs]; spec_decoding_generation_lengths: [bs]; spec_decoding_position_offsets: [bs, max_gen_tokens]; spec_decoding_packed_mask: [bs, max_draft_len, packed_length] (**); eagle_temperature: [bs]; rand_data_validation: [bs, max_draft_tokens].
- (**) The mask is tricky since the boolean mask will need to be packed at runtime, so the last dim will be: packed_length = ceil((max_draft_tokens+1)/32).
 
 
 
 
- class tensorrt_llm.models.EncoderModel(*args, **kwargs)[source]#
- Bases: - PretrainedModel- check_config(
- config: PretrainedConfig,
 - forward(
- input_ids: Tensor,
- input_lengths=None,
- position_ids=None,
- token_type_ids=None,
- hidden_states=None,
- max_input_length=None,
- prompt_embedding_table=None,
- prompt_tasks=None,
- prompt_vocab_size=None,
- attention_mask=None,
- lora_params: LoraParams = None,
- language_adapter_routings: Tensor | None = None,
 - prepare_inputs(
- max_batch_size,
- max_input_len,
- prompt_embedding_table_size: int = 0,
- lora_target_modules: List[str] = None,
- *args,
- **kwargs,
- @brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes. - @return: a list containing values which can be fed into self.forward()
 
- class tensorrt_llm.models.FalconConfig(
- *,
- bias: bool = False,
- parallel_attention: bool = False,
- num_ln_in_parallel_attn: int | None = None,
- new_decoder_architecture: bool = False,
- rotary_base: float = 10000.0,
- **kwargs,
- Bases: - PretrainedConfig- classmethod from_hugging_face(
- hf_config_or_dir: str | transformers.PretrainedConfig,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 
- class tensorrt_llm.models.FalconForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - FalconConfig
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create a FalconForCausalLM object from the given parameters.
 
- class tensorrt_llm.models.FalconModel(config: FalconConfig)[source]#
- Bases: - Module
- class tensorrt_llm.models.GPTConfig(
- *,
- gpt_variant: str = 'gpt2',
- bias: bool = True,
- q_scaling: float = 1.0,
- embedding_scale: float | None = None,
- apply_query_key_layer_scaling: bool = False,
- rotary_pct: float = 1.0,
- rotary_base: float = 10000.0,
- rotary_scaling: dict | None = None,
- inner_layernorm: bool = False,
- norm_before_bmm1: bool = False,
- moe: MoeConfig | dict | None = None,
- **kwargs,
- Bases: - PretrainedConfig- classmethod from_hugging_face(
- hf_config_or_dir: str | transformers.PretrainedConfig,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 - classmethod from_nemo(
- nemo_ckpt_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 
- class tensorrt_llm.models.GPTForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create a GPTForCausalLM object from the given parameters.
 - classmethod from_nemo(
- nemo_ckpt_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 - classmethod quantize(
- hf_model_dir: str,
- output_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- *,
- device: str = 'cuda',
- calib_dataset: str = 'cnn_dailymail',
- calib_batches: int = 512,
- calib_batch_size: int = 1,
- calib_max_seq_length: int = 512,
- random_seed: int = 1234,
- tokenizer_max_seq_length: int = 2048,
- **kwargs,
 
- class tensorrt_llm.models.GPTJConfig(*, rotary_dim: int = 64, **kwargs)[source]#
- Bases: - PretrainedConfig- This is the configuration class to store the configuration of GPTJ model. - classmethod from_hugging_face(
- hf_config_or_dir: str | transformers.PretrainedConfig,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 
- class tensorrt_llm.models.GPTJForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - GPTJConfig
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config=None,
- **kwargs,
- Create an LLM object and load weights from Hugging Face. Parameters: hf_model_or_dir – the Hugging Face model directory or an already-loaded transformers.PreTrainedModel; dtype – str, the default weights data type when loading from the Hugging Face model; mapping – Mapping, specifies the multi-GPU parallel strategy (when it is None, a single GPU is used).
 
- class tensorrt_llm.models.GPTJModel(config: GPTJConfig)[source]#
- Bases: - Module
- class tensorrt_llm.models.GPTNeoXForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM
- class tensorrt_llm.models.GPTNeoXModel(
- config: PretrainedConfig,
- Bases: - Module
- class tensorrt_llm.models.GemmaConfig(
- *,
- architecture: str,
- rotary_base: float = 10000.0,
- rotary_scaling: dict | None = None,
- attn_bias: bool = False,
- mlp_bias: bool = False,
- position_embedding_type: PositionEmbeddingType = PositionEmbeddingType.rope_gpt_neox,
- query_pre_attn_scalar: int | None = None,
- final_logit_softcapping: float | None = None,
- attn_logit_softcapping: float | None = None,
- mapping: Mapping | dict | None = None,
- sliding_window_pattern: int = None,
- rope_local_base_freq: int = None,
- sliding_window: int = None,
- **kwargs,
- Bases: - PretrainedConfig- GEMMA2_ADDED_FIELDS = {'attn_logit_softcapping', 'final_logit_softcapping', 'query_pre_attn_scalar'}#
 - GEMMA3_ADDED_FIELDS = {'final_logit_softcapping', 'query_pre_attn_scalar', 'rope_local_base_freq', 'sliding_window', 'sliding_window_pattern'}#
 - GEMMA_ADDED_FIELDS = {'attn_bias', 'inter_layernorms', 'mlp_bias', 'rotary_base', 'rotary_scaling'}#
 - VERBATIM = {'attn_logit_softcapping', 'final_logit_softcapping', 'hidden_act', 'hidden_size', 'intermediate_size', 'max_position_embeddings', 'num_attention_heads', 'num_hidden_layers', 'query_pre_attn_scalar', 'rope_local_base_freq', 'sliding_window', 'sliding_window_pattern', 'use_parallel_embedding', 'vocab_size'}#
 - classmethod from_hugging_face(
- hf_config_or_dir: HfConfigOrDir,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 - property is_gemma_2: bool#
 - property is_gemma_3: bool#
 
- class tensorrt_llm.models.GemmaForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- NATIVE_QUANT_FLOW = {QuantAlgo.W4A16, QuantAlgo.W8A16, QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN, QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN, QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN, QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN}#
 - config_class#
- alias of - GemmaConfig
 - classmethod from_hugging_face(
- hf_model_dir: HfConfigOrDir,
- dtype='float16',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- load_model_on_cpu: bool = True,
- **kwargs,
- Create an LLM object and load weights from Hugging Face. Parameters: hf_model_dir – the Hugging Face model directory; dtype – str, the default weights data type when loading from the Hugging Face model; mapping – Mapping, specifies the multi-GPU parallel strategy (when it is None, a single GPU is used).
 - classmethod quantize(
- hf_model_dir: str,
- output_dir: str,
- dtype: str = 'float16',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- *,
- gemma_config_kwargs: Dict[str, Any] = None,
- **quantize_kwargs: Dict[str, Any],
 
- class tensorrt_llm.models.LLaMAConfig(
- *,
- mlp_bias: bool = False,
- attn_bias: bool = False,
- rotary_base: float = 10000.0,
- rotary_scaling: dict | None = None,
- residual_mlp: bool = False,
- disable_weight_only_quant_plugin: bool = False,
- moe: MoeConfig | dict | None = None,
- remove_duplicated_kv_heads: bool = False,
- embedding_multiplier: float = 1.0,
- attention_multiplier: float = 1.0,
- residual_multiplier: float = 1.0,
- output_multiplier_scale: float = 1.0,
- **kwargs,
- Bases: - PretrainedConfig- classmethod from_hugging_face(
- hf_config_or_dir: str | transformers.PretrainedConfig,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 - classmethod from_meta_ckpt(
- meta_ckpt_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 
- class tensorrt_llm.models.LLaMAForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - LLaMAConfig
 - default_plugin_config(**kwargs)[source]#
- Return the default plugin config for this model, used when the plugin_config value is not given in the to_trt() call. If users need to set different plugin configs, they can start from the returned object and change it.
 - classmethod from_hugging_face(
- hf_model_or_dir: str | PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create a LLaMAForCausalLM object from the given parameters.
 - classmethod from_meta_ckpt(
- meta_ckpt_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 - classmethod quantize(
- hf_model_dir: str,
- output_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- *,
- device: str = 'cuda',
- calib_dataset: str = 'cnn_dailymail',
- calib_batches: int = 512,
- calib_batch_size: int = 1,
- calib_max_seq_length: int = 512,
- random_seed: int = 1234,
- tokenizer_max_seq_length: int = 2048,
- **kwargs,
 
- class tensorrt_llm.models.LLaMAModel(config: LLaMAConfig)[source]#
- Bases: - Module- forward(
- input_ids,
- position_ids=None,
- use_cache=False,
- attention_mask=None,
- spec_decoding_params=None,
- kv_cache_params=None,
- attention_params=None,
- hidden_states=None,
- hidden_states_for_embed=None,
- prompt_embedding_table: Tensor | None = None,
- prompt_tasks: Tensor | None = None,
- prompt_vocab_size: Tensor | None = None,
- lora_params=None,
 
- class tensorrt_llm.models.LlavaNextVisionConfig(
- *,
- image_size: int,
- patch_size: int,
- text_hidden_size: int,
- projector_hidden_act: str = 'gelu',
- num_channels: int = 3,
- vision_model_type: str = 'clip_vision_model',
- **kwargs,
- Bases: - PretrainedConfig- classmethod from_hugging_face(
- hf_config_or_dir: str | transformers.PretrainedConfig,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 
- class tensorrt_llm.models.LlavaNextVisionWrapper(*args, **kwargs)[source]#
- Bases: - PretrainedModel- classmethod from_hugging_face(
- hf_model_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create a LlavaNextVisionWrapper object from the given parameters.
 
- class tensorrt_llm.models.MLLaMAForCausalLM(*args, **kwargs)[source]#
- Bases: - PretrainedModel- config_class#
- alias of - MLLaMAConfig
 - forward(
- decoder_input_ids: Tensor,
- encoder_output: Tensor,
- use_cache=False,
- attention_mask_params=None,
- last_token_ids=None,
- kv_cache_params=None,
- attention_params=None,
- hidden_states=None,
- lora_params: LoraParams = None,
- cross_kv_cache_gen: Tensor | None = None,
- cross_kv_reuse: Tensor | None = None,
- prompt_embedding_table: Tensor | None = None,
- prompt_tasks: Tensor | None = None,
- prompt_vocab_size: Tensor | None = None,
- skip_cross_attn_blocks: Tensor | None = None,
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create a MLLaMAForCausalLM object from the given parameters.
 - prepare_inputs(
- max_batch_size,
- max_beam_width,
- max_decoder_input_len,
- max_seq_len,
- max_encoder_input_len,
- gather_context_logits: bool = False,
- gather_generation_logits: bool = False,
- lora_target_modules: List[str] = None,
- prompt_embedding_table_size: int = 0,
- use_cache=True,
- *args,
- **kwargs,
- @brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes. - @return: a list containing values which can be fed into self.forward()
 
- class tensorrt_llm.models.MPTModel(config: PretrainedConfig)[source]#
- Bases: - Module
- class tensorrt_llm.models.MambaForCausalLM(*args, **kwargs)[source]#
- Bases: - PretrainedModel- config_class#
- alias of - MambaConfig
 - forward(
- input_ids,
- conv_states,
- ssm_states,
- host_request_types,
- last_token_ids,
- last_token_ids_for_logits,
- host_context_lengths,
- slot_mapping: Tensor | None = None,
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create an LLM object and load weights from Hugging Face. Parameters: hf_model_or_dir – the Hugging Face model directory or an already-loaded transformers.PreTrainedModel; dtype – str, the default weights data type when loading from the Hugging Face model; mapping – Mapping, specifies the multi-GPU parallel strategy (when it is None, a single GPU is used).
 - prepare_inputs(
- max_batch_size,
- max_input_len,
- max_seq_len,
- max_num_tokens,
- use_cache,
- max_beam_width: int = 1,
- opt_num_tokens: int = None,
- opt_batch_size: int = 0,
- prompt_embedding_table_size: int = 0,
- max_draft_len: int = 0,
- gather_context_logits: bool = False,
- lora_target_modules: List[str] = None,
- speculative_decoding_draft_tokens_external: bool = False,
- @brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes. - @return: a list containing values which can be fed into self.forward()
 
- class tensorrt_llm.models.MedusaConfig(
- *,
- num_medusa_heads: int = 4,
- num_medusa_layers: int = 1,
- max_draft_len: int = 63,
- **kwargs,
- Bases: - PretrainedConfig- classmethod from_hugging_face(
- hf_config_or_dir: str | transformers.PretrainedConfig,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
 
- class tensorrt_llm.models.MedusaForCausalLm(*args, **kwargs)[source]#
- Bases: - PretrainedModel- config_class#
- alias of - MedusaConfig
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create an LLM object and load weights from Hugging Face. Parameters: hf_model_or_dir – the Hugging Face model directory or an already-loaded transformers.PreTrainedModel; dtype – str, the default weights data type when loading from the Hugging Face model; mapping – Mapping, specifies the multi-GPU parallel strategy (when it is None, a single GPU is used).
 
- class tensorrt_llm.models.OPTModel(config: PretrainedConfig)[source]#
- Bases: - Module
- class tensorrt_llm.models.Phi3ForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - Phi3Config
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create an LLM object and load weights from Hugging Face. Parameters: hf_model_or_dir – the Hugging Face model directory or an already-loaded transformers.PreTrainedModel; dtype – str, the default weights data type when loading from the Hugging Face model; mapping – Mapping, specifies the multi-GPU parallel strategy (when it is None, a single GPU is used).
 
- class tensorrt_llm.models.Phi3Model(
- config: PretrainedConfig,
- Bases: - Module
- class tensorrt_llm.models.PhiForCausalLM(*args, **kwargs)[source]#
- Bases: - DecoderModelForCausalLM- config_class#
- alias of - PhiConfig
 - classmethod from_hugging_face(
- hf_model_or_dir: str | transformers.PreTrainedModel,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- **kwargs,
- Create an LLM object and load weights from Hugging Face. Parameters: hf_model_or_dir – the Hugging Face model directory or an already-loaded transformers.PreTrainedModel; dtype – str, the default weights data type when loading from the Hugging Face model; mapping – Mapping, specifies the multi-GPU parallel strategy (when it is None, a single GPU is used).
 
- class tensorrt_llm.models.PhiModel(config: PretrainedConfig)[source]#
- Bases: - Module
- class tensorrt_llm.models.PretrainedConfig(
- *,
- architecture: str,
- dtype: str,
- hidden_size: int,
- num_hidden_layers: int,
- num_attention_heads: int,
- vocab_size: int | None = None,
- hidden_act: str = 'gelu',
- logits_dtype: str = 'float32',
- norm_epsilon: float = 1e-05,
- position_embedding_type: PositionEmbeddingType | str = PositionEmbeddingType.learned_absolute,
- max_position_embeddings: int | None = None,
- rotary_embedding_dim: int | None = None,
- num_key_value_heads: int | None = None,
- intermediate_size: int | None = None,
- mapping: Mapping | dict | None = None,
- quantization: QuantConfig | dict | None = None,
- use_parallel_embedding: bool = False,
- embedding_sharding_dim: int = 0,
- head_size: int | None = None,
- qk_layernorm: bool = False,
- runtime_defaults: RuntimeDefaultsIn = None,
- **kwargs,
- Bases: - object- static create_runtime_defaults(
- defaults: RuntimeDefaultsIn = None,
 - property kv_dtype#
 - property quant_algo#
 - property quant_mode#
 
- class tensorrt_llm.models.PretrainedModel(*args, **kwargs)[source]#
- Bases: - Module,- GenerationMixin,- TopModelMixin- classmethod from_checkpoint(
- ckpt_dir: str,
- rank: int | None = None,
- config: PretrainedConfig | None = None,
- *,
- preprocess_weights_hook: Callable[[Dict[str, Tensor]], Dict[str, Tensor]] | None = None,
 - classmethod from_config(
- config: PretrainedConfig,
 - prepare_inputs(
- max_batch_size,
- max_input_len,
- max_seq_len,
- max_num_tokens,
- use_cache,
- max_beam_width: int = 1,
- opt_num_tokens: int = None,
- prompt_embedding_table_size: int = 0,
- position_encoding_2d: bool = False,
- max_draft_len: int = 0,
- speculative_decoding_draft_tokens_external: bool = False,
- spec_decoding_is_generation_length_variable: bool = False,
- gather_context_logits: bool = False,
- lora_target_modules: List[str] = None,
- opt_batch_size: int = 0,
- num_hidden_layers: int = None,
- mrope_rotary_cos_sin_size: int = None,
- @brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes. - @return: a list containing values which can be fed into self.forward()
 - classmethod quantize(
- hf_model_dir: str,
- output_dir: str,
- dtype: str = 'auto',
- mapping: Mapping | None = None,
- quant_config: QuantConfig | None = None,
- *,
- device: str = 'cuda',
- calib_dataset: str = 'cnn_dailymail',
- calib_batches: int = 512,
- calib_batch_size: int = 1,
- calib_max_seq_length: int = 512,
- random_seed: int = 1234,
- tokenizer_max_seq_length: int = 2048,
- **kwargs,
 
- class tensorrt_llm.models.ReDrafterForCausalLM(*args, **kwargs)[source]#
- Bases: - LLaMAForCausalLM- prepare_inputs(*args, **kwargs)[source]#
- Inputs needed:
- Assuming max_gen_tokens = 1 + nb*(bl - 1), counting the true token: device_request_types: [bs]; draft_tokens: [bs, nb, bl]; draft_indices: [bs, nb, bl]; draft_probs: [bs, nb, bl-1, V]; spec_decoding_generation_lengths: [bs]; spec_decoding_position_offsets: [bs, max_gen_tokens]; spec_decoding_packed_mask: [bs, max_gen_tokens, packed_length] (**); redrafter_inverted_temperature: [bs]; rand_data_sample: [bs]; rand_data_validation: [bs, nb, bl-1].
- (**) The mask is tricky since the boolean mask will need to be packed at runtime, so the last dim will be: packed_length = ceil(max_gen_tokens/32).
 
 
 
 
- class tensorrt_llm.models.RecurrentGemmaForCausalLM(*args, **kwargs)[source]#
- Bases: - PretrainedModel- forward(
- input_ids,
- position_ids=None,
- use_cache=False,
- attention_mask=None,
- kv_cache_params=None,
- attention_params=None,
- conv_states=None,
- rnn_states=None,
- host_request_types=None,
- last_token_ids=None,
- last_token_ids_for_logits=None,
- host_context_lengths=None,
- slot_mapping=None,
 - prepare_inputs(
- max_batch_size,
- max_input_len,
- max_seq_len,
- max_num_tokens,
- use_cache,
- max_beam_width: int = 1,
- opt_num_tokens: int = None,
- opt_batch_size: int = 0,
- prompt_embedding_table_size: int = 0,
- max_draft_len: int = 0,
- gather_context_logits: bool = False,
- lora_target_modules: List[str] = None,
- speculative_decoding_draft_tokens_external: bool = False,
- @brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes. - @return: a list containing values which can be fed into self.forward()
 
- tensorrt_llm.models.RobertaForQuestionAnswering#
- alias of - BertForQuestionAnswering
- tensorrt_llm.models.RobertaForSequenceClassification#
- alias of - BertForSequenceClassification
- class tensorrt_llm.models.SD3Transformer2DModel(*args, **kwargs)[source]#
- Bases: - PretrainedModel- property attn_processors#
 - config_class#
- alias of - SD3Transformer2DModelConfig
 - forward(
- hidden_states: Tensor,
- encoder_hidden_states: Tensor | None = None,
- pooled_projections: Tensor | None = None,
- timestep: Tensor | None = None,
- block_controlnet_hidden_states: List[Tensor] = None,
- joint_attention_kwargs: Dict[str, Any] | None = None,
 - classmethod from_pretrained(
- pretrained_model_name_or_path: str,
- dtype='float16',
- mapping=<tensorrt_llm.mapping.Mapping object>,
- **kwargs,
 
- class tensorrt_llm.models.SpeculativeDecodingMode(
- value,
- names=<not given>,
- *values,
- module=None,
- qualname=None,
- type=None,
- start=1,
- boundary=None,
- Bases: - IntFlag- DRAFT_TOKENS_EXTERNAL = 2#
 - EAGLE = 32#
 - EXPLICIT_DRAFT_TOKENS = 16#
 - LOOKAHEAD_DECODING = 8#
 - MEDUSA = 4#
 - NONE = 1#
 
- class tensorrt_llm.models.WhisperEncoder(*args, **kwargs)[source]#
- Bases: - PretrainedModel