Layers#
Activation#
Attention#
- class tensorrt_llm.layers.attention.Attention(*, local_layer_idx, hidden_size, num_attention_heads, num_kv_heads=None, max_position_embeddings=1024, num_layers=1, apply_query_key_layer_scaling=False, attention_head_size=None, qk_layernorm=False, layernorm_type=LayerNormType.LayerNorm, layernorm_share=True, inner_layernorm=False, eps=1e-05, attention_mask_type=AttentionMaskType.padding, bias=True, dtype=None, position_embedding_type=PositionEmbeddingType.learned_absolute, rotary_embedding_base=10000.0, rotary_embedding_base_local=1.0, rotary_embedding_scaling=None, rotary_embedding_percentage=1.0, rope_scaling_short_factors=None, rope_scaling_long_factors=None, rope_scaling_short_mscale=None, rope_scaling_long_mscale=None, original_max_position_embeddings=1024, tp_group=None, tp_size=1, tp_rank=0, quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>, q_scaling=1.0, cross_attention=False, relative_attention=False, max_distance=0, num_buckets=0, dense_bias=None, clip_qkv=None, alibi_bias_max=8, skip_cross_kv=False, max_attn_value=0.0, block_sparse_params=None, use_implicit_relative_attention=False, reorder=False, enable_qkv=True, cp_group=[0], cp_size=1, cp_rank=0, max_seqlen_for_logn_scaling=8192, use_logn_scaling=False, is_local=False)
Bases:
Module
- forward(
- hidden_states: Tensor,
- attention_mask=None,
- attention_packed_mask=None,
- use_cache=False,
- spec_decoding_params=None,
- mrope_params=None,
- kv_cache_params=None,
- attention_params=None,
- encoder_output: Tensor | None = None,
- position_embedding=None,
- norm_before_bmm1=False,
- lora_layer_params=None,
- cross_kv_cache_gen: Tensor | None = None,
- cross_kv_reuse: Tensor | None = None,
- all_reduce_params: AllReduceParams | None = None,
- skip_attn=None,
- class tensorrt_llm.layers.attention.AttentionMaskParams(
- self_attention_mask: Tensor = None,
- self_attention_packed_mask: Tensor = None,
- cross_attention_mask: Tensor = None,
- cross_attention_packed_mask: Tensor = None,
Bases:
object
- class tensorrt_llm.layers.attention.AttentionParams(
- sequence_length: Tensor = None,
- context_lengths: Tensor = None,
- host_context_lengths: Tensor = None,
- max_context_length: int = None,
- host_request_types: Tensor = None,
- encoder_input_lengths: Tensor = None,
- encoder_max_input_length: Tensor = None,
- host_runtime_perf_knobs: Tensor = None,
- host_context_progress: Tensor = None,
Bases:
object
- fill_attention_const_params_for_long_rope(
- embed_positions,
- long_rope_embed_positions,
- rotary_inv_freq,
- long_rope_rotary_inv_freq,
- embed_positions_for_gpt_attention,
- long_rope_embed_positions_for_gpt_attention,
- short_mscale,
- long_mscale,
- class tensorrt_llm.layers.attention.BertAttention(
- hidden_size,
- num_attention_heads,
- max_position_embeddings=1024,
- num_layers=1,
- attention_head_size=None,
- num_kv_heads=None,
- q_scaling=1.0,
- apply_query_key_layer_scaling=False,
- bias=True,
- dtype=None,
- tp_group=None,
- tp_size=1,
- tp_rank=0,
- cp_group=None,
- cp_size=1,
- relative_attention=False,
- max_distance=0,
- num_buckets=0,
- quant_mode=<QuantMode: 0>,
Bases:
Module
- class tensorrt_llm.layers.attention.BlockSparseAttnParams(
- block_size: int = 64,
- homo_head_pattern: bool = False,
- num_local_blocks: int = 16,
- vertical_stride: int = 8,
Bases:
object
- class tensorrt_llm.layers.attention.CogVLMAttention(
- *,
- local_layer_idx,
- hidden_size,
- num_attention_heads,
- num_kv_heads=None,
- max_position_embeddings=1024,
- attention_mask_type=AttentionMaskType.causal,
- bias=True,
- dtype=None,
- position_embedding_type=PositionEmbeddingType.learned_absolute,
- rotary_embedding_base=10000.0,
- rotary_embedding_scaling=None,
- tp_group=None,
- tp_size=1,
- tp_rank=0,
- quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>,
- dense_bias=None,
Bases:
Attention
- class tensorrt_llm.layers.attention.DeepseekV2Attention(
- *,
- local_layer_idx,
- hidden_size,
- num_attention_heads,
- q_lora_rank,
- kv_lora_rank,
- qk_nope_head_dim=None,
- qk_rope_head_dim=None,
- v_head_dim=None,
- eps=1e-06,
- attention_mask_type=AttentionMaskType.causal,
- dtype=None,
- position_embedding_type=PositionEmbeddingType.learned_absolute,
- max_position_embeddings=1024,
- rotary_embedding_base=10000.0,
- rotary_embedding_scaling=None,
- rotary_embedding_beta_fast=32,
- rotary_embedding_beta_slow=1,
- rotary_embedding_mscale=1,
- rotary_embedding_mscale_all_dim=0,
- rotary_embedding_origin_max_position=4096,
- rotary_scaling=None,
- tp_group=None,
- tp_size=1,
- tp_rank=0,
- quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>,
Bases:
Attention
- class tensorrt_llm.layers.attention.DiffusersAttention(
- *,
- query_dim: int,
- cross_attention_dim: int | None = None,
- heads: int = 8,
- kv_heads: int | None = None,
- dim_head: int = 64,
- dropout: float = 0.0,
- bias: bool = False,
- upcast_attention: bool = False,
- upcast_softmax: bool = False,
- cross_attention_norm: str | None = None,
- cross_attention_norm_num_groups: int = 32,
- qk_norm: str | None = None,
- added_kv_proj_dim: int | None = None,
- added_proj_bias: bool | None = True,
- norm_num_groups: int | None = None,
- spatial_norm_dim: int | None = None,
- out_bias: bool = True,
- scale_qk: bool = True,
- only_cross_attention: bool = False,
- eps: float = 1e-05,
- rescale_output_factor: float = 1.0,
- residual_connection: bool = False,
- out_dim: int = None,
- out_context_dim: int = None,
- context_pre_only=None,
- pre_only=False,
- elementwise_affine: bool = True,
- is_causal: bool = False,
- attn_forward_funcname: str = 'joint_attn_forward',
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.attention.KeyValueCacheParams(
- past_key_value: List[Tensor] = None,
- host_past_key_value_lengths: Tensor = None,
- host_max_attention_window_sizes: Tensor = None,
- host_sink_token_length: Tensor = None,
- kv_cache_block_offsets: Tensor = None,
- host_kv_cache_block_offsets: Tensor = None,
- host_kv_cache_pool_pointers: Tensor = None,
- host_kv_cache_pool_mapping: Tensor = None,
- cache_indirection: Tensor = None,
- past_key_value_length: Tensor = None,
- cross_kv_cache_block_offsets: Tensor = None,
- host_cross_kv_cache_block_offsets: Tensor = None,
- host_cross_kv_cache_pool_pointers: Tensor = None,
- host_cross_kv_cache_pool_mapping: Tensor = None,
Bases:
object
- class tensorrt_llm.layers.attention.SpecDecodingParams(
- spec_decoding_is_generation_length_variable: bool = False,
- spec_decoding_max_generation_length: int = 1,
- spec_decoding_generation_lengths: Tensor = None,
- spec_decoding_position_offsets: Tensor = None,
- spec_decoding_packed_mask: Tensor = None,
- spec_decoding_use: Tensor = None,
Bases:
object
Cast#
Conv#
- class tensorrt_llm.layers.conv.Conv1d(
- in_channels: int,
- out_channels: int,
- kernel_size: int,
- stride: int = 1,
- padding: int = 0,
- dilation: int = 1,
- groups: int = 1,
- bias: bool = True,
- padding_mode: str = 'zeros',
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.conv.Conv2d(
- in_channels: int,
- out_channels: int,
- kernel_size: Tuple[int, int],
- stride: Tuple[int, int] = (1, 1),
- padding: Tuple[int, int] = (0, 0),
- dilation: Tuple[int, int] = (1, 1),
- groups: int = 1,
- bias: bool = True,
- padding_mode: str = 'zeros',
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.conv.Conv3d(
- in_channels: int,
- out_channels: int,
- kernel_size: Tuple[int, int, int],
- stride: Tuple[int, int, int] = (1, 1, 1),
- padding: Tuple[int, int, int] = (0, 0, 0),
- dilation: Tuple[int, int, int] = (1, 1, 1),
- groups: int = 1,
- bias: bool = True,
- padding_mode: str = 'zeros',
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.conv.ConvTranspose2d(
- in_channels: int,
- out_channels: int,
- kernel_size: Tuple[int, int],
- stride: Tuple[int, int] = (1, 1),
- padding: Tuple[int, int] = (0, 0),
- output_padding: Tuple[int, int] = (0, 0),
- dilation: Tuple[int, int] = (1, 1),
- groups: int = 1,
- bias: bool = True,
- padding_mode: str = 'zeros',
- dtype=None,
Bases:
Module
Embedding#
- class tensorrt_llm.layers.embedding.CombinedTimestepLabelEmbeddings(
- num_classes,
- embedding_dim,
- class_dropout_prob=0.0,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.embedding.CombinedTimestepTextProjEmbeddings(
- embedding_dim,
- pooled_projection_dim,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.embedding.Embedding(
- num_embeddings: int,
- embedding_dim: int,
- dtype: str | None = None,
- tp_size: int = 1,
- tp_group: list | None = None,
- sharding_dim: int = 0,
- tp_rank: int | None = None,
Bases:
Module
The embedding layer takes input indices (x) and the embedding lookup table (weight) as input and outputs the corresponding embeddings according to the input indices. The size of weight is [num_embeddings, embedding_dim].
Four parameters (tp_size, tp_group, sharding_dim, tp_rank) control tensor parallelism. Tensor parallelism is enabled only when tp_size > 1 and tp_group is not None; see the sketch after this list.
- When sharding_dim == 0, the weight is sharded along the vocabulary dimension; tp_rank must be set in this case.
- When sharding_dim == 1, the weight is sharded along the hidden dimension.
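A minimal construction sketch, assuming a 2-way tensor-parallel setup; the vocabulary size, hidden size, and Mapping configuration are illustrative values, not API defaults:

```python
from tensorrt_llm import Mapping
from tensorrt_llm.layers import Embedding

# Illustrative 2-way tensor-parallel mapping (rank 0 shown).
mapping = Mapping(world_size=2, rank=0, tp_size=2)

vocab_embedding = Embedding(
    num_embeddings=32000,       # full vocabulary size (illustrative)
    embedding_dim=4096,         # hidden size (illustrative)
    dtype='float16',
    tp_size=mapping.tp_size,    # > 1 is required to enable tensor parallelism
    tp_group=mapping.tp_group,  # must not be None for tensor parallelism
    sharding_dim=0,             # shard the weight along the vocabulary dimension
    tp_rank=mapping.tp_rank,    # required when sharding_dim == 0
)
```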
- class tensorrt_llm.layers.embedding.LabelEmbedding(
- num_classes: int,
- hidden_size: int,
- dropout_prob: float = 0.0,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.embedding.PixArtAlphaTextProjection(
- in_features,
- hidden_size,
- out_features=None,
- act_fn='gelu_tanh',
- mapping=None,
- dtype=None,
Bases:
Module
Projects caption embeddings. Also handles dropout for classifier-free guidance.
Adapted from PixArt-alpha/PixArt-alpha
- class tensorrt_llm.layers.embedding.PromptTuningEmbedding(
- num_embeddings,
- embedding_dim,
- vocab_size=None,
- dtype=None,
- tp_size=1,
- tp_group=None,
- sharding_dim=0,
- tp_rank=0,
Bases:
Embedding
PromptTuningEmbedding handles fine-tuned prompts with virtual tokens. At runtime, a supplementary embedding dictionary is passed. Tokens whose ids are >= vocab_size are embedded with that additional dictionary. The prompt-tuning dictionary holds multiple tasks, and each sequence is assigned a given task. Prompt-tuned tokens from a given sequence use the corresponding task dictionary, as specified by the tasks input.
- forward(
- tokens,
- prompt_embedding_table,
- tasks,
- task_vocab_size,
Passes all tokens through both the normal and the prompt embedding tables. Tokens are masked so that the "normal" embedding only sees "normal" tokens, and likewise for the "prompt" embedding. After the two lookups, the results are combined based on whether each token was "normal" or "prompt-tuned" (see the conceptual sketch after the parameter list below).
- Parameters:
tokens (Tensor) – the ids to embed, size [batch_size, seq_len]
prompt_embedding_table (Tensor) – the additional embedding table for prompt-tuned tokens, size [num_tasks * num_tokens_per_task, hidden_size]
tasks (Tensor) – the task id required by each token, size [batch_size, seq_len]
task_vocab_size (Tensor) – the number of tokens used for each task; should equal prompt_embedding_table's num_tokens_per_task, size [1]
- Returns:
Tokens’ embedding
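The masking and combination described above can be illustrated with a small NumPy sketch. This is a conceptual stand-in, not the TensorRT-LLM graph code, and all sizes are made up for the example:

```python
import numpy as np

# Tiny illustrative sizes.
vocab_size, hidden_size = 8, 4
num_tasks, tokens_per_task = 2, 3

normal_table = np.random.rand(vocab_size, hidden_size).astype(np.float32)
prompt_table = np.random.rand(num_tasks * tokens_per_task, hidden_size).astype(np.float32)

tokens = np.array([[1, 5, vocab_size + 0, vocab_size + 2]])  # [batch_size, seq_len]
tasks = np.array([[0, 0, 1, 1]])                             # task id assigned to each token

is_prompt = tokens >= vocab_size                 # prompt-tuned tokens have ids >= vocab_size
normal_ids = np.where(is_prompt, 0, tokens)      # mask prompt ids to a valid "normal" id
prompt_ids = np.where(is_prompt, tokens - vocab_size, 0) + tasks * tokens_per_task

normal_emb = normal_table[normal_ids]            # [batch_size, seq_len, hidden_size]
prompt_emb = prompt_table[prompt_ids]            # [batch_size, seq_len, hidden_size]
output = np.where(is_prompt[..., None], prompt_emb, normal_emb)
```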
- class tensorrt_llm.layers.embedding.SD3PatchEmbed(
- height: int = 224,
- width: int = 224,
- patch_size: int = 16,
- in_channels: int = 3,
- embed_dim: int = 768,
- layer_norm: bool = False,
- flatten: bool = True,
- bias: bool = True,
- interpolation_scale: int = 1,
- pos_embed_type: str = 'sincos',
- pos_embed_max_size: int | None = None,
- dtype=None,
Bases:
Module
2D Image to Patch Embedding with support for SD3 cropping.
- class tensorrt_llm.layers.embedding.TimestepEmbedding(
- in_channels: int,
- time_embed_dim: int,
- act_fn: str = 'silu',
- out_dim: int = None,
- post_act_fn: str | None = None,
- cond_proj_dim=None,
- sample_proj_bias=True,
- mapping=None,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.embedding.Timesteps(
- num_channels: int,
- flip_sin_to_cos: bool,
- downscale_freq_shift: float,
- scale: int = 1,
Bases:
Module
- tensorrt_llm.layers.embedding.get_1d_sincos_pos_embed_from_grid(
- embed_dim: int,
- pos: Tensor,
- tensorrt_llm.layers.embedding.get_2d_sincos_pos_embed(
- embed_dim: int,
- grid_size: int | Sequence[int],
- cls_token: bool = False,
- extra_tokens: int = 0,
- interpolation_scale: float = 1.0,
- base_size: int = 16,
- tensorrt_llm.layers.embedding.get_2d_sincos_pos_embed_from_grid(
- embed_dim: int,
- grid: Sequence[Tensor],
- tensorrt_llm.layers.embedding.get_timestep_embedding(
- timesteps: Tensor,
- embedding_dim: int,
- flip_sin_to_cos: bool = False,
- downscale_freq_shift: float = 1,
- scale: float = 1,
- max_period: int = 10000,
Creates sinusoidal timestep embeddings, matching the implementation in Denoising Diffusion Probabilistic Models (DDPM). A NumPy sketch follows the argument list below.
- Args:
- timesteps (Tensor):
a 1-D Tensor of N indices, one per batch element. These may be fractional.
- embedding_dim (int):
the dimension of the output.
- flip_sin_to_cos (bool):
Whether the embedding order should be cos, sin (if True) or sin, cos (if False)
- downscale_freq_shift (float):
Controls the delta between the frequencies of adjacent dimensions.
- scale (float):
Scaling factor applied to the embeddings.
- max_period (int):
Controls the maximum frequency of the embeddings
- Returns:
Tensor: an [N x dim] Tensor of positional embeddings.
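A NumPy sketch of this sinusoidal embedding, following the common DDPM/diffusers formulation; the graph implementation may differ in minor details (for example, zero-padding when embedding_dim is odd):

```python
import numpy as np

def sinusoidal_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=False,
                                  downscale_freq_shift=1.0, scale=1.0, max_period=10000):
    # Frequencies decay geometrically from 1 down to roughly 1/max_period.
    half_dim = embedding_dim // 2
    exponent = -np.log(max_period) * np.arange(half_dim, dtype=np.float32)
    exponent = exponent / (half_dim - downscale_freq_shift)
    freqs = np.exp(exponent)                                   # [half_dim]

    args = scale * timesteps[:, None].astype(np.float32) * freqs[None, :]
    emb = np.concatenate([np.sin(args), np.cos(args)], axis=-1)
    if flip_sin_to_cos:                                        # order cos, sin instead of sin, cos
        emb = np.concatenate([emb[:, half_dim:], emb[:, :half_dim]], axis=-1)
    return emb                                                 # [N, embedding_dim], assuming an even embedding_dim

print(sinusoidal_timestep_embedding(np.array([0, 10, 100]), 8).shape)  # (3, 8)
```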
Linear#
- class tensorrt_llm.layers.linear.Linear(
- in_features,
- out_features,
- bias=True,
- dtype=None,
- tp_group=None,
- tp_size=1,
- gather_output=True,
- share_weight=None,
- strict_dtype=False,
- pad_lda=0,
- pad_ldc=0,
- prefer_managed_weight=True,
- is_qkv=False,
Bases:
LinearBase
- class tensorrt_llm.layers.linear.LinearBase(
- local_in_features,
- local_out_features,
- bias=True,
- dtype=None,
- tp_group=None,
- tp_size=1,
- share_weight=None,
- strict_dtype=False,
- pad_lda=0,
- pad_ldc=0,
- prefer_managed_weight=True,
Bases:
Module
- forward(
- x,
- lora_runtime_params: LoraRuntimeParams | None = None,
- lora_hidden_state: Tensor | None = None,
- **kwargs,
- multiply_and_lora(
- x,
- weight,
- gemm_plugin: str | None = None,
- low_latency_gemm_plugin: str | None = None,
- use_fp8: bool = False,
- alpha: ndarray | None = None,
- lora_runtime_params: LoraRuntimeParams | None = None,
- lora_hidden_state: Tensor | None = None,
- class tensorrt_llm.layers.linear.RowLinear(
- in_features,
- out_features,
- bias=True,
- dtype=None,
- tp_group=None,
- tp_size=1,
- strict_dtype: bool = False,
- pad_lda=0,
- prefer_managed_weight=True,
- is_expert=False,
Bases:
LinearBase
MLP#
- class tensorrt_llm.layers.mlp.FusedGatedMLP(
- hidden_size,
- ffn_hidden_size,
- hidden_act,
- bias=True,
- dtype=None,
- tp_group=None,
- tp_size=1,
- quant_mode=<QuantMode: 0>,
- inner_layernorm=False,
- eps=1e-05,
- is_expert=False,
Bases:
Module
- forward(
- hidden_states,
- lora_layer_params=None,
- all_reduce_params: AllReduceParams | None = None,
- class tensorrt_llm.layers.mlp.GatedMLP(
- hidden_size,
- ffn_hidden_size,
- hidden_act,
- bias=True,
- dtype=None,
- tp_group=None,
- tp_size=1,
- quant_mode=<QuantMode: 0>,
- inner_layernorm=False,
- eps=1e-05,
- is_expert=False,
Bases:
MLP
- forward(
- hidden_states,
- lora_layer_params=None,
- all_reduce_params: AllReduceParams | None = None,
- class tensorrt_llm.layers.mlp.LinearActivation(
- dim_in: int,
- dim_out: int,
- bias: bool = True,
- activation: str = 'silu',
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.mlp.LinearApproximateGELU(
- dim_in: int,
- dim_out: int,
- bias: bool = True,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.mlp.LinearGEGLU(
- dim_in: int,
- dim_out: int,
- approximate: str = 'tanh',
- bias: bool = True,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.mlp.LinearGELU(
- dim_in: int,
- dim_out: int,
- approximate: str = 'tanh',
- bias: bool = True,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.mlp.LinearSwiGLU(
- dim_in: int,
- dim_out: int,
- bias: bool = True,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.mlp.MLP(
- hidden_size,
- ffn_hidden_size,
- hidden_act,
- bias=True,
- dtype=None,
- tp_group=None,
- tp_size=1,
- quant_mode=<QuantMode: 0>,
- inner_layernorm=False,
- eps=1e-05,
- is_expert=False,
Bases:
Module
Normalization#
- class tensorrt_llm.layers.normalization.AdaLayerNorm(
- embedding_dim: int,
- num_embeddings: int | None = None,
- output_dim: int | None = None,
- norm_elementwise_affine: bool = False,
- norm_eps: float = 1e-05,
- chunk_dim: int = 0,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.normalization.AdaLayerNormContinuous(
- embedding_dim: int,
- conditioning_embedding_dim: int,
- elementwise_affine: bool = True,
- eps: float = 1e-05,
- bias: bool = True,
- norm_type: str = 'layer_norm',
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.normalization.AdaLayerNormZero(
- embedding_dim: int,
- num_embeddings: int | None = None,
- norm_type: str = 'layer_norm',
- bias: bool = True,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.normalization.AdaLayerNormZeroSingle(
- embedding_dim: int,
- norm_type: str = 'layer_norm',
- bias: bool = True,
- mapping=<tensorrt_llm.mapping.Mapping object>,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.normalization.GroupNorm(
- num_groups,
- num_channels,
- eps=1e-05,
- affine=True,
- dtype=None,
Bases:
Module
- class tensorrt_llm.layers.normalization.LayerNorm(
- normalized_shape,
- eps=1e-05,
- elementwise_affine=True,
- bias=True,
- dtype=None,
- tp_size=1,
- tp_dim=-1,
Bases:
Module