
This module defines the model_config format.

This format can be converted from a Hugging Face, NeMo, or ModelOpt-quantized model, and the tensorrt_llm engine is built from the context saved in this format.



class AttentionConfig

Bases: object

The attention layer config.

__init__(qkv=None, dense=None, kv_cache_scaling_factor=None, kv_cache_dtype=None, rotary_dim=-inf, clip_qkv=None, rel_attn_table=None)
  • qkv (QKVConfig | LinearConfig) –

  • dense (LinearConfig) –

  • kv_cache_scaling_factor (Tensor) –

  • kv_cache_dtype (str | None) –

  • rotary_dim (int) –

  • clip_qkv (float) –

  • rel_attn_table (Tensor) –

Return type:


clip_qkv: float = None
dense: LinearConfig = None
kv_cache_dtype: str | None = None
kv_cache_scaling_factor: Tensor = None
qkv: QKVConfig | LinearConfig = None
rel_attn_table: Tensor = None
rotary_dim: int = -inf
class ConvConfig

Bases: object

The Conv layer config.

__init__(weight=None, bias=None)
  • weight (Tensor) –

  • bias (Tensor) –

Return type:


bias: Tensor = None
weight: Tensor = None
class DecoderLayerConfig

Bases: object

The decoder layer config.

__init__(quantization=None, decoder_type='', input_layernorm=None, mlp_layernorm=None, attention=None, recurrent=None, post_layernorm=None, pre_feedforward_layernorm=None, post_feedforward_layernorm=None, mlp=None, num_attention_heads=0, attention_head_size=None, num_kv_heads=0, max_position_embeddings=0, rotary_pct=1.0, use_alibi=False, new_decoder_architecture=False, parallel_attention=False, apply_residual_connection_post_layernorm=False, use_cache=True, model_name='', rope_ratio=1.0, seq_length=0, qwen_type='', rotary_base=0, partial_rotary_factor=0, original_max_position_embeddings=0, longrope_scaling_short_factors=None, longrope_scaling_long_factors=None, mup_attn_multiplier=0, mup_embedding_multiplier=0, mup_use_scaling=0, mup_width_multiplier=0, blocksparse_block_size=0, blocksparse_homo_head_pattern=False, blocksparse_num_local_blocks=0, blocksparse_vertical_stride=0, dense_attention_every_n_layers=0, gegelu_limit=0, longrope_short_mscale=0, longrope_long_mscale=0, moe_num_experts=0, moe_top_k=0, moe_tp_mode=0, moe_renorm_mode=0, alibi_bias_max=0, residual_layernorm=None, residual_mlp=None, rnn_hidden_size=0, logits_soft_cap=0, emb_scale_by_sqrt_dim=False, layer_types=<factory>, final_logit_softcapping=0, attn_logit_softcapping=0, query_pre_attn_scalar=0, clip_qkv=0, cross_attention=None, cross_attention_layernorm=None, self_attention=None, self_attention_layernorm=None, attention_layernorm=None, rel_attn_max_distance=0, rel_attn_num_buckets=0, rope_scaling=None)
  • quantization (str | None) –

  • decoder_type (str) –

  • input_layernorm (LayernormConfig) –

  • mlp_layernorm (LayernormConfig) –

  • attention (AttentionConfig) –

  • recurrent (RecurrentConfig) –

  • post_layernorm (LayernormConfig) –

  • pre_feedforward_layernorm (LayernormConfig) –

  • post_feedforward_layernorm (LayernormConfig) –

  • mlp (MLPConfig | MOEConfig) –

  • num_attention_heads (int) –

  • attention_head_size (int) –

  • num_kv_heads (int) –

  • max_position_embeddings (int) –

  • rotary_pct (float) –

  • use_alibi (bool) –

  • new_decoder_architecture (bool) –

  • parallel_attention (bool) –

  • apply_residual_connection_post_layernorm (bool) –

  • use_cache (bool) –

  • model_name (str) –

  • rope_ratio (float) –

  • seq_length (int) –

  • qwen_type (str) –

  • rotary_base (int) –

  • partial_rotary_factor (float) –

  • original_max_position_embeddings (int) –

  • longrope_scaling_short_factors (List[float]) –

  • longrope_scaling_long_factors (List[float]) –

  • mup_attn_multiplier (float) –

  • mup_embedding_multiplier (float) –

  • mup_use_scaling (float) –

  • mup_width_multiplier (float) –

  • blocksparse_block_size (int) –

  • blocksparse_homo_head_pattern (bool) –

  • blocksparse_num_local_blocks (int) –

  • blocksparse_vertical_stride (int) –

  • dense_attention_every_n_layers (int) –

  • gegelu_limit (float) –

  • longrope_short_mscale (float) –

  • longrope_long_mscale (float) –

  • moe_num_experts (int) –

  • moe_top_k (int) –

  • moe_tp_mode (int) –

  • moe_renorm_mode (int) –

  • alibi_bias_max (int) –

  • residual_layernorm (LayernormConfig) –

  • residual_mlp (MLPConfig) –

  • rnn_hidden_size (int) –

  • logits_soft_cap (float) –

  • emb_scale_by_sqrt_dim (bool) –

  • layer_types (List[str]) –

  • final_logit_softcapping (float) –

  • attn_logit_softcapping (float) –

  • query_pre_attn_scalar (float) –

  • clip_qkv (int) –

  • cross_attention (AttentionConfig) –

  • cross_attention_layernorm (LayernormConfig) –

  • self_attention (AttentionConfig) –

  • self_attention_layernorm (LayernormConfig) –

  • attention_layernorm (LayernormConfig) –

  • rel_attn_max_distance (int) –

  • rel_attn_num_buckets (int) –

  • rope_scaling (dict) –

Return type:


alibi_bias_max: int = 0
apply_residual_connection_post_layernorm: bool = False
attention: AttentionConfig = None
attention_head_size: int = None
attention_layernorm: LayernormConfig = None
attn_logit_softcapping: float = 0
blocksparse_block_size: int = 0
blocksparse_homo_head_pattern: bool = False
blocksparse_num_local_blocks: int = 0
blocksparse_vertical_stride: int = 0
clip_qkv: int = 0
cross_attention: AttentionConfig = None
cross_attention_layernorm: LayernormConfig = None
decoder_type: str = ''
dense_attention_every_n_layers: int = 0
emb_scale_by_sqrt_dim: bool = False
property ffn_hidden_size_local

Returns the ffn hidden size of the transformer model.

final_logit_softcapping: float = 0
gegelu_limit: float = 0
property hidden_size

Returns the hidden size of the transformer model.

input_layernorm: LayernormConfig = None
layer_types: List[str]
logits_soft_cap: float = 0
longrope_long_mscale: float = 0
longrope_scaling_long_factors: List[float] = None
longrope_scaling_short_factors: List[float] = None
longrope_short_mscale: float = 0
max_position_embeddings: int = 0
mlp: MLPConfig | MOEConfig = None
mlp_layernorm: LayernormConfig = None
model_name: str = ''
moe_num_experts: int = 0
moe_renorm_mode: int = 0
moe_top_k: int = 0
moe_tp_mode: int = 0
mup_attn_multiplier: float = 0
mup_embedding_multiplier: float = 0
mup_use_scaling: float = 0
mup_width_multiplier: float = 0
new_decoder_architecture: bool = False
num_attention_heads: int = 0
num_kv_heads: int = 0
original_max_position_embeddings: int = 0
parallel_attention: bool = False
partial_rotary_factor: float = 0
post_feedforward_layernorm: LayernormConfig = None
post_layernorm: LayernormConfig = None
pre_feedforward_layernorm: LayernormConfig = None
quantization: str | None = None
query_pre_attn_scalar: float = 0
qwen_type: str = ''
recurrent: RecurrentConfig = None
rel_attn_max_distance: int = 0
rel_attn_num_buckets: int = 0
residual_layernorm: LayernormConfig = None
residual_mlp: MLPConfig = None
rnn_hidden_size: int = 0
rope_ratio: float = 1.0
rope_scaling: dict = None
rotary_base: int = 0
rotary_pct: float = 1.0
self_attention: AttentionConfig = None
self_attention_layernorm: LayernormConfig = None
seq_length: int = 0
use_alibi: bool = False
use_cache: bool = True
class EmbeddingConfig

Bases: object

The embedding layer config.


__init__(weight=None)
  • weight (Tensor) –

Return type:


property hidden_size

Infers the hidden_size from the embedding layer weights shape.

property local_vocab_size

Infers the vocab_size from the embedding layer weights shape.

weight: Tensor = None
class ExpertConfig

Bases: object

The Expert config.

__init__(fc=None, proj=None)
Return type:


fc: LinearConfig = None
proj: LinearConfig = None
class LayernormConfig

Bases: object

The layernorm layer config.

__init__(weight=None, bias=None, layernorm_type='', eps=1e-05)
  • weight (Tensor) –

  • bias (Tensor) –

  • layernorm_type (str) –

  • eps (float) –

Return type:


bias: Tensor = None
eps: float = 1e-05
layernorm_type: str = ''
weight: Tensor = None
class LinearActConfig

Bases: object

The linear + activation layer config.

__init__(linear=None, hidden_act='')
Return type:


hidden_act: str = ''
linear: LinearConfig = None
class LinearConfig

Bases: object

The linear layer config.

__init__(linear_type='column', weight=None, bias=None, activation_scaling_factor=None, weights_scaling_factor=None, weights_scaling_factor_2=None, prequant_scaling_factor=None, awq_block_size=0)
  • linear_type (str) –

  • weight (Tensor) –

  • bias (Tensor) –

  • activation_scaling_factor (Tensor) –

  • weights_scaling_factor (Tensor) –

  • weights_scaling_factor_2 (Tensor) –

  • prequant_scaling_factor (Tensor) –

  • awq_block_size (int) –

Return type:


activation_scaling_factor: Tensor = None
awq_block_size: int = 0
bias: Tensor = None
linear_type: str = 'column'
prequant_scaling_factor: Tensor = None
weight: Tensor = None
weights_scaling_factor: Tensor = None
weights_scaling_factor_2: Tensor = None
class MLPConfig

Bases: object

The MLP layer config.

__init__(fc=None, gate=None, proj=None, hidden_act='', merged_fc1_gate=False)
Return type:


fc: LinearConfig = None
gate: LinearConfig = None
hidden_act: str = ''
merged_fc1_gate: bool = False
proj: LinearConfig = None
class MOEConfig

Bases: object

The Mixture of Experts layer config.

__init__(router=None, experts=None, hidden_act='')
Return type:


experts: ExpertConfig = None
property fc

Return the fc module from experts.

hidden_act: str = ''
router: LinearConfig = None
class MedusaHeadConfig

Bases: object

The Medusa head config.

__init__(medusa_layers=None, lm_head=None)
Return type:


lm_head: LinearConfig = None
medusa_layers: List[LinearActConfig] = None
class ModelConfig

Bases: object

The full LLM model config that includes the full information needed for tensorrt_llm engine building.

This class includes all the fields that tensorrt_llm supports, but not all of the fields are required. pipeline_parallel > 1 is only supported for TensorRT-LLM checkpoints.

__init__(version=0.0, quantization=None, dtype='float16', vocab_size=0, rank=0, tensor_parallel=1, pipeline_parallel=1, vocab_embedding=None, position_embedding=None, block_embedding=None, ln_embed=None, layers=<factory>, ln_f=None, lm_head=None, share_embedding_table=False, medusa_heads=None, num_medusa_heads=0, num_medusa_layers=0, enc_dec='', encoder_hidden_size=0, encoder_num_heads=0, encoder_head_size=0)
  • version (float) –

  • quantization (str) –

  • dtype (str) –

  • vocab_size (int) –

  • rank (int) –

  • tensor_parallel (int) –

  • pipeline_parallel (int) –

  • vocab_embedding (EmbeddingConfig) –

  • position_embedding (EmbeddingConfig) –

  • block_embedding (EmbeddingConfig) –

  • ln_embed (LayernormConfig) –

  • layers (List[DecoderLayerConfig]) –

  • ln_f (LayernormConfig) –

  • lm_head (LinearConfig) –

  • share_embedding_table (bool) –

  • medusa_heads (List[MedusaHeadConfig]) –

  • num_medusa_heads (int) –

  • num_medusa_layers (int) –

  • enc_dec (str) –

  • encoder_hidden_size (int) –

  • encoder_num_heads (int) –

  • encoder_head_size (int) –

Return type:


block_embedding: EmbeddingConfig = None
dtype: str = 'float16'
enc_dec: str = ''
encoder_head_size: int = 0
encoder_hidden_size: int = 0
encoder_num_heads: int = 0
property hidden_act

Returns the hidden_act of the model.

property hidden_size

Returns the hidden_size of the model.

layers: List[DecoderLayerConfig]
lm_head: LinearConfig = None
ln_embed: LayernormConfig = None
ln_f: LayernormConfig = None
property max_position_embeddings

Returns the max_position_embeddings of the model.

medusa_heads: List[MedusaHeadConfig] = None
property num_attention_heads

Returns the num_attention_heads of the model.

property num_kv_heads

Returns the num_key_value_heads of the model.

num_medusa_heads: int = 0
num_medusa_layers: int = 0
pipeline_parallel: int = 1
position_embedding: EmbeddingConfig = None
quantization: str = None
rank: int = 0
share_embedding_table: bool = False
tensor_parallel: int = 1
version: float = 0.0
vocab_embedding: EmbeddingConfig = None
vocab_size: int = 0
property vocab_size_padded

Returns the padded vocab_size of the model, rounded to the tensor_parallel size.

class QKVConfig

Bases: object

The QKV layer config.

__init__(q=None, k=None, v=None)
Return type:


property activation_scaling_factor

Returns the merged activation_scaling_factor across Q, K and V.

The max of the Q, K, V activation scaling factors is returned.

property awq_block_size

Returns the awq_block_size of this QKV layer.

property bias

The generated linear layer bias.

The Q, K, V biases are concatenated together to fit the TensorRT-LLM QKV linear layer.

k: LinearConfig = None
property prequant_scaling_factor

Returns the merged prequant_scaling_factor across Q, K and V.

Prequant scaling factors for Q, K, V should be the same, so only one of them is returned.

q: LinearConfig = None
v: LinearConfig = None
property weight

The generated linear layer weight.

The Q, K, V weights are concatenated together to fit the TensorRT-LLM QKV linear layer.

property weights_scaling_factor

Returns the merged weights_scaling_factor across Q, K and V.

If the quantization is FP8, the max of the Q, K, V weight scaling factors is returned. If the quantization is INT8_SQ, the concatenated value is returned.

property weights_scaling_factor_2

Returns the merged weights_scaling_factor_2 across Q, K and V.

weights_scaling_factor_2 is needed for W4A8 AWQ.

class RecurrentConfig

Bases: object

The RecurrentBlock from recurrentgemma.

__init__(linear_y=None, y_bias=None, linear_x=None, linear_out=None, conv1d=None, rg_lru=None)
Return type:


conv1d: ConvConfig = None
linear_out: LinearConfig = None
linear_x: LinearConfig = None
linear_y: LinearConfig = None
rg_lru: RgLruConfig = None
y_bias: Tensor = None
class RgLruConfig

Bases: object

The RG LRU from recurrentgemma.

__init__(recurrent_param=None, input_gate=None, recurrent_gate=None)
Return type:


input_gate: LinearConfig = None
recurrent_gate: LinearConfig = None
recurrent_param: Tensor = None