Layers

Activation

class tensorrt_llm.layers.activation.Mish[source]

Bases: Module

forward(input)[source]

Attention

class tensorrt_llm.layers.attention.Attention(*, local_layer_idx, hidden_size, num_attention_heads, num_kv_heads=None, max_position_embeddings=1024, num_layers=1, apply_query_key_layer_scaling=False, attention_head_size=None, qk_layernorm=False, layernorm_type=LayerNormType.LayerNorm, layernorm_share=True, inner_layernorm=False, eps=1e-05, attention_mask_type=AttentionMaskType.padding, bias=True, dtype=None, position_embedding_type=PositionEmbeddingType.learned_absolute, rotary_embedding_base=10000.0, rotary_embedding_scaling=None, rotary_embedding_percentage=1.0, rope_scaling_short_factors=None, rope_scaling_long_factors=None, rope_scaling_short_mscale=None, rope_scaling_long_mscale=None, original_max_position_embeddings=1024, tp_group=None, tp_size=1, tp_rank=0, quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = QuantMode.None, q_scaling=1.0, cross_attention=False, relative_attention=False, max_distance=0, num_buckets=0, dense_bias=None, clip_qkv=None, alibi_bias_max=8, skip_cross_kv=False, max_attn_value=0.0, block_sparse_params=None, use_implicit_relative_attention=False, reorder=False, layer_idx_in_cache_pool=None)[source]

Bases: Module

static create_attention_const_params(model_cls, config)[source]
static fill_attention_params(model_cls, attention_params)[source]
forward(hidden_states: Tensor, attention_mask=None, attention_packed_mask=None, use_cache=False, spec_decoding_params=None, kv_cache_params=None, attention_params=None, encoder_output: Tensor | None = None, position_embedding=None, norm_before_bmm1=False, lora_layer_params=None, cross_kv_cache_gen: Tensor | None = None, cross_kv_reuse: Tensor | None = None, reduce_fusion_params: AllReduceFusionParams | None = None)[source]
postprocess(tllm_key, weights, **kwargs)[source]
set_rel_attn_table(max_seq_len, precomputed_relative_attention)[source]
class tensorrt_llm.layers.attention.AttentionMaskParams(self_attention_mask: Tensor | None = None, self_attention_packed_mask: Tensor | None = None, cross_attention_mask: Tensor | None = None, cross_attention_packed_mask: Tensor | None = None)[source]

Bases: object

class tensorrt_llm.layers.attention.AttentionParams(sequence_length: Tensor | None = None, context_lengths: Tensor | None = None, host_context_lengths: Tensor | None = None, max_context_length: int | None = None, host_request_types: Tensor | None = None, encoder_input_lengths: Tensor | None = None, encoder_max_input_length: Tensor | None = None, host_runtime_perf_knobs: Tensor | None = None, host_context_progress: Tensor | None = None)[source]

Bases: object

fill_attention_const_params_for_long_rope(embed_positions_short_factors, embed_positions_long_factors, embed_positions_short_factors_for_attention_plugin, embed_positions_long_factors_for_attention_plugin, short_mscale, long_mscale, short_inv_freq, long_inv_freq)[source]
fill_attention_const_params_for_rope(embed_positions: Tensor | None = None, rotary_inv_freq: Tensor | None = None, embed_positions_for_gpt_attention: Tensor | None = None)[source]
is_valid(gpt_attention_plugin, remove_input_padding, use_kv_cache)[source]
is_valid_cross_attn(do_cross_attention)[source]
class tensorrt_llm.layers.attention.BertAttention(hidden_size, num_attention_heads, max_position_embeddings=1024, num_layers=1, attention_head_size=None, num_kv_heads=None, q_scaling=1.0, apply_query_key_layer_scaling=False, bias=True, dtype=None, tp_group=None, tp_size=1, tp_rank=0, cp_group=None, cp_size=1, relative_attention=False, max_distance=0, num_buckets=0, quant_mode=QuantMode.None)[source]

Bases: Module

forward(hidden_states: Tensor, attention_mask=None, input_lengths=None, max_input_length=None, lora_layer_params=None)[source]
class tensorrt_llm.layers.attention.BlockSparseAttnParams(block_size: int = 64, homo_head_pattern: bool = False, num_local_blocks: int = 16, vertical_stride: int = 8)[source]

Bases: object

class tensorrt_llm.layers.attention.CogVLMAttention(*, local_layer_idx, hidden_size, num_attention_heads, num_kv_heads=None, max_position_embeddings=1024, attention_mask_type=AttentionMaskType.causal, bias=True, dtype=None, position_embedding_type=PositionEmbeddingType.learned_absolute, rotary_embedding_base=10000.0, rotary_embedding_scaling=None, tp_group=None, tp_size=1, tp_rank=0, quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = QuantMode.None, dense_bias=None)[source]

Bases: Attention

forward(hidden_states: Tensor, use_cache=False, kv_cache_params=None, attention_params=None, vision_token_mask=None, position_embedding=None)[source]
class tensorrt_llm.layers.attention.KeyValueCacheParams(past_key_value: List[Tensor] | None = None, host_past_key_value_lengths: Tensor | None = None, host_max_attention_window_sizes: Tensor | None = None, host_sink_token_length: Tensor | None = None, kv_cache_block_offsets: Tensor | None = None, host_kv_cache_block_offsets: Tensor | None = None, host_kv_cache_pool_pointers: Tensor | None = None, host_kv_cache_pool_mapping: Tensor | None = None, cache_indirection: Tensor | None = None, past_key_value_length: Tensor | None = None, cross_kv_cache_block_offsets: Tensor | None = None, host_cross_kv_cache_block_offsets: Tensor | None = None, host_cross_kv_cache_pool_pointers: Tensor | None = None, host_cross_kv_cache_pool_mapping: Tensor | None = None)[source]

Bases: object

fill_none_tensor_list(list_size)[source]
get_first_past_key_value()[source]
is_valid(gpt_attention_plugin)[source]
class tensorrt_llm.layers.attention.SpecDecodingParams(spec_decoding_is_generation_length_variable: bool = False, spec_decoding_max_generation_length: int = 1, spec_decoding_generation_lengths: Tensor | None = None, spec_decoding_position_offsets: Tensor | None = None, spec_decoding_packed_mask: Tensor | None = None)[source]

Bases: object

tensorrt_llm.layers.attention.compute_relative_bias(query_length, key_length, num_buckets, max_distance, bidirectional, rel_attn_table, tp_size=1, tp_group=None, tp_rank=None)[source]
tensorrt_llm.layers.attention.make_causal_mask(bsz, tgt_len, past_key_values_length, dtype)[source]

Cast

class tensorrt_llm.layers.cast.Cast(output_dtype: str = 'float32')[source]

Bases: Module

forward(x)[source]

Conv

class tensorrt_llm.layers.conv.Conv1d(in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, padding: int = 0, dilation: int = 1, groups: int = 1, bias: bool = True, padding_mode: str = 'zeros', dtype=None)[source]

Bases: Module

forward(input)[source]
class tensorrt_llm.layers.conv.Conv2d(in_channels: int, out_channels: int, kernel_size: Tuple[int, int], stride: Tuple[int, int] = (1, 1), padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1, bias: bool = True, padding_mode: str = 'zeros', dtype=None)[source]

Bases: Module

forward(input)[source]
class tensorrt_llm.layers.conv.ConvTranspose2d(in_channels: int, out_channels: int, kernel_size: Tuple[int, int], stride: Tuple[int, int] = (1, 1), padding: Tuple[int, int] = (0, 0), output_padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1, bias: bool = True, padding_mode: str = 'zeros', dtype=None)[source]

Bases: Module

forward(input, output_size=None)[source]

Embedding

class tensorrt_llm.layers.embedding.Embedding(num_embeddings: int, embedding_dim: int, dtype: str | None = None, tp_size: int = 1, tp_group: list | None = None, sharding_dim: int = 0, tp_rank: int | None = None)[source]

Bases: Module

The embedding layer takes input indices (x) and the embedding lookup table (weight) as input, and outputs the corresponding embeddings according to the input indices. The size of weight is [num_embeddings, embedding_dim].

Four parameters (tp_size, tp_group, sharding_dim, tp_rank) are involved in tensor parallelism. Tensor parallelism is enabled only when “tp_size > 1 and tp_group is not None”.

When “sharding_dim == 0”, the weight is sharded in the vocabulary dimension.

tp_rank must be set when sharding_dim == 0.

When “sharding_dim == 1”, the weight is sharded in the hidden dimension.

forward(x)[source]
postprocess(tllm_key, weights, **kwargs)[source]
weight_loader(mapping: Mapping, param: Parameter, loaded_weight: Tensor)[source]
class tensorrt_llm.layers.embedding.PromptTuningEmbedding(num_embeddings, embedding_dim, vocab_size=None, dtype=None, tp_size=1, tp_group=None, sharding_dim=0, tp_rank=0)[source]

Bases: Embedding

PromptTuningEmbedding handles fine-tuned prompts with virtual tokens. At runtime, a supplementary embedding dictionary is passed. Tokens whose ids are >= vocab_size are embedded with that additional dictionary. The prompt tuning dictionary holds multiple tasks, and each sequence is assigned a given task. Prompt-tuned tokens from a given sequence use the appropriate task dictionary, as defined by the tasks input.

forward(tokens, prompt_embedding_table, tasks, task_vocab_size)[source]

Pass all tokens through both the normal and prompt embedding tables. Tokens are masked so that the “normal” embedding only sees “normal” tokens. The same logic applies to the “prompt” embedding. After those two embeddings, combine the results based on whether each token was “normal” or “prompt-tuned”.

Parameters:
  • tokens – Tensor; the ids to embed, size [batch_size, seq_len]

  • prompt_embedding_table – Tensor; the additional embedding table for prompt-tuned tokens, size [num_tasks * num_tokens_per_task, hidden_size]

  • tasks – Tensor; the task required by each token, size [batch_size, seq_len]

  • task_vocab_size – Tensor; the number of tokens used for each task, which should be equal to prompt_embedding_table’s num_tokens_per_task, size [1]

Returns:

Tokens’ embedding

Linear

tensorrt_llm.layers.linear.ColumnLinear

alias of Linear

class tensorrt_llm.layers.linear.Linear(in_features, out_features, bias=True, dtype=None, tp_group=None, tp_size=1, gather_output=True, share_weight=None, strict_dtype=False, pad_lda=0, prefer_managed_weight=True, is_qkv=False)[source]

Bases: LinearBase

collect_and_bias(x, **kwargs)[source]
postprocess(tllm_key, weights, **kwargs)[source]
classmethod tp_split_dim() → int[source]
class tensorrt_llm.layers.linear.LinearBase(local_in_features, local_out_features, bias=True, dtype=None, tp_group=None, tp_size=1, share_weight=None, strict_dtype=False, pad_lda=0, prefer_managed_weight=True)[source]

Bases: Module

abstract collect_and_bias(x: Tensor) → Tensor[source]
forward(x, lora_runtime_params: LoraRuntimeParams | None = None, lora_hidden_state: Tensor | None = None, **kwargs) → Tensor[source]
get_weight() → Tensor[source]
multiply_and_lora(x, weight, gemm_plugin: str | None = None, low_latency_gemm_plugin: str | None = None, use_fp8: bool = False, alpha: ndarray | None = None, lora_runtime_params: LoraRuntimeParams | None = None, lora_hidden_state: Tensor | None = None)[source]
multiply_collect(x, weight, gemm_plugin: str | None = None, low_latency_gemm_plugin: str | None = None, use_fp8: bool = False, alpha: ndarray | None = None, lora_runtime_params: LoraRuntimeParams | None = None, lora_hidden_state: Tensor | None = None, **kwargs)[source]
abstract classmethod tp_split_dim() → int[source]
weight_is_kn()[source]
weight_loader(mapping: Mapping, param: Parameter, loaded_weight: Tensor) → None[source]
class tensorrt_llm.layers.linear.RowLinear(in_features, out_features, bias=True, dtype=None, tp_group=None, tp_size=1, strict_dtype: bool = False, pad_lda=0, prefer_managed_weight=True, is_expert=False)[source]

Bases: LinearBase

collect_and_bias(x, **kwargs)[source]
classmethod tp_split_dim() → int[source]

MLP

class tensorrt_llm.layers.mlp.FusedGatedMLP(hidden_size, ffn_hidden_size, hidden_act, bias=True, dtype=None, tp_group=None, tp_size=1, quant_mode=QuantMode.None, inner_layernorm=False, eps=1e-05, is_expert=False)[source]

Bases: Module

fc_gate(hidden_states, lora_layer_params=None)[source]
fc_gate_plugin(hidden_states, lora_layer_params=None)[source]
forward(hidden_states, lora_layer_params=None, reduce_fusion_params: AllReduceFusionParams | None = None)[source]
class tensorrt_llm.layers.mlp.GatedMLP(hidden_size, ffn_hidden_size, hidden_act, bias=True, dtype=None, tp_group=None, tp_size=1, quant_mode=QuantMode.None, inner_layernorm=False, eps=1e-05, is_expert=False)[source]

Bases: MLP

forward(hidden_states, lora_layer_params=None, reduce_fusion_params: AllReduceFusionParams | None = None)[source]
class tensorrt_llm.layers.mlp.MLP(hidden_size, ffn_hidden_size, hidden_act, bias=True, dtype=None, tp_group=None, tp_size=1, quant_mode=QuantMode.None, inner_layernorm=False, eps=1e-05, is_expert=False)[source]

Bases: Module

forward(hidden_states, lora_layer_params=None, gegelu_limit=None)[source]
tensorrt_llm.layers.mlp.fc_gate_lora(hidden_states, lora, lora_layer_params)[source]

Normalization

class tensorrt_llm.layers.normalization.GroupNorm(num_groups, num_channels, eps=1e-05, affine=True, dtype=None)[source]

Bases: Module

forward(x)[source]
class tensorrt_llm.layers.normalization.LayerNorm(normalized_shape, eps=1e-05, elementwise_affine=True, bias=True, dtype=None, tp_size=1, tp_dim=-1)[source]

Bases: Module

forward(x, normalized_shape=None)[source]
class tensorrt_llm.layers.normalization.RmsNorm(normalized_shape, num_groups=1, eps=1e-06, elementwise_affine=True, dtype=None)[source]

Bases: Module

forward(x, normalized_shape=None)[source]

Pooling

class tensorrt_llm.layers.pooling.AvgPool2d(kernel_size: Tuple[int], stride: Tuple[int] | None = None, padding: Tuple[int] | None = (0, 0), ceil_mode: bool = False, count_include_pad: bool = True)[source]

Bases: Module

forward(input)[source]