Binding Names#
-
namespace trt_edgellm#
-
namespace binding_names#
Core LLM Input/Output Bindings
-
char const *kInputsEmbeds = "inputs_embeds"#
Input embeddings tensor - contains the embedded input sequence.
Shape: [batch_size, sequence_length, hidden_size] (FLOAT16)
-
char const *kContextLengths = "context_lengths"#
Context lengths tensor - specifies the actual length of each sequence in the batch.
Shape: [batch_size] (INT32)
-
char const *kLastTokenIds = "last_token_ids"#
Last token IDs tensor - indices of the last tokens to extract from hidden states.
Shape: [batch_size] for Eagle models, [batch_size, 1] for vanilla models (INT64)
-
char const *kLogits = "logits"#
Output logits tensor - probability distribution over vocabulary.
Shape: [batch_size, vocab_size] or [select_tokens, vocab_size] (FLOAT32)
-
char const *kOutputHiddenStates = "hidden_states"#
Output hidden states tensor - intermediate representations for speculative decoding.
Shape: [batch_size, sequence_length, hidden_dim] (FLOAT16)
Positional Encoding Bindings
-
char const *kRopeCosSin = "rope_rotary_cos_sin"#
Rotary positional encoding cos/sin cache tensor.
Shape: [batch_size, max_seq_len, rotary_dim] (FLOAT32)
KV Cache Bindings
-
char const *kKVCacheStartIndex = "kvcache_start_index"#
KV cache start index tensor - starting position for KV cache reuse.
Shape: [batch_size] (INT32)
-
char const *kPastKeyValuesTemplate = "past_key_values"#
Past key-value cache tensor template - use with layer index formatting.
Template: “past_key_values_{layer_idx}” Shape: [batch_size, 2, num_kv_heads, seq_len, head_dim] (FLOAT16)
-
char const *kPresentKeyValuesTemplate = "present_key_values"#
Present key-value cache tensor template - use with layer index formatting.
Template: “present_key_values_{layer_idx}” Shape: [batch_size, 2, num_kv_heads, seq_len, head_dim] (FLOAT16)
-
char const *kKCacheTemplate = "k_cache"#
K cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.
Template: “k_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)
-
char const *kVCacheTemplate = "v_cache"#
V cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.
Template: “v_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)
-
char const *kPresentKCacheTemplate = "present_k_cache"#
Present K cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.
Template: “present_k_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)
-
char const *kPresentVCacheTemplate = "present_v_cache"#
Present V cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.
Template: “present_v_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)
SSM (Mamba) State Bindings
-
char const *kSSMStateTemplate = "ssm_state"#
Past SSM state tensor template for Mamba layers.
Template: “ssm_state_{mamba_layer_idx}” Shape: [batch_size, mamba_num_heads, mamba_head_dim, ssm_state_size] (FLOAT16)
-
char const *kPresentSSMStateTemplate = "present_ssm_state"#
Present SSM state tensor template for Mamba layers.
Template: “present_ssm_state_{mamba_layer_idx}” Shape: [batch_size, mamba_num_heads, mamba_head_dim, ssm_state_size] (FLOAT16)
-
char const *kConvStateTemplate = "conv_state"#
Past conv state tensor template for Mamba layers.
Template: “conv_state_{mamba_layer_idx}” Shape: [batch_size, conv_dim, conv_kernel_size] (FLOAT16)
-
char const *kPresentConvStateTemplate = "present_conv_state"#
Present conv state tensor template for Mamba layers.
Template: “present_conv_state_{mamba_layer_idx}” Shape: [batch_size, conv_dim, conv_kernel_size] (FLOAT16)
Eagle Speculative Decoding Bindings
-
char const *kBaseModelHiddenStates = "hidden_states_input"#
Base model hidden states input for Eagle draft models.
Shape: [batch_size, sequence_length, base_hidden_dim] (FLOAT16)
-
char const *kDraftModelHiddenStates = "hidden_states_from_draft"#
Draft model hidden states input for Eagle draft models.
Shape: [batch_size, sequence_length, draft_hidden_dim] (FLOAT16)
-
char const *kAttentionMask = "attention_mask"#
Attention mask for Eagle models - packed tree attention mask.
Shape: [batch_size, tree_size, packed_mask_len] (INT32 for base, INT8 for draft)
-
char const *kAttentionPosId = "attention_pos_id"#
Attention position IDs for Eagle models.
Shape: [batch_size, tree_size] (INT32)
Visual Encoder Bindings (Qwen-VL, InternVL)
-
char const *kVisualInput = "input"#
Visual input tensor for vision transformers.
Shape: [sequence_length, input_dim] for Qwen-VL, [num_blocks, channels, height, width] for InternVL
-
char const *kVisualOutput = "output"#
Visual output tensor from vision transformers.
Shape: [num_image_tokens, hidden_size] (FLOAT16)
-
char const *kRotaryPosEmb = "rotary_pos_emb"#
Rotary positional embeddings for visual inputs (Qwen-VL specific)
Shape: [sequence_length, embed_dim] (FLOAT32)
-
char const *kCuSeqlens = "cu_seqlens"#
Cumulative sequence lengths for ragged ViT attention.
Shape: [num_images + 1] (INT32)
-
char const *kMaxSeqLenCarrier = "max_seqlen_carrier"#
Shape-only input used to convey runtime max sequence-length for FMHA launch.
Shape: [max_seqlen] (INT32)
-
char const *kCuWindowSeqlens = "cu_window_seqlens"#
Cumulative window sequence lengths for Qwen2.5-VL window attention.
Shape: [num_windows + 1] (INT32)
-
char const *kWindowIndex = "window_index"#
Window index for Qwen2.5-VL sliding window attention.
Shape: [num_windows] (INT64)
-
char const *kReverseWindowIndex = "reverse_window_index"#
Reverse window index for Qwen2.5-VL sliding window attention.
Shape: [num_windows] (INT64)
-
char const *kFastPosEmbIdx = "fast_pos_embed_idx"#
Fast position embeddings index tensor for Qwen3-VL vision model.
Shape: [4, sequence_length] (INT64)
-
char const *kFastPosEmbWeight = "fast_pos_embed_weight"#
Fast position embeddings weight tensor for Qwen3-VL vision model.
Shape: [4, sequence_length] (FLOAT16)
-
char const *kDeepstackFeaturesTemplate = "deepstack_features"#
Deepstack features tensor for Qwen3-VL vision model (visual encoder output)
Shape: [num_image_tokens, hidden_size] (FLOAT16)
-
char const *kDeepstackEmbedsTemplate = "deepstack_embeds"#
Deepstack embeddings tensor template for Qwen3-VL text model (LLM input)
Template: “deepstack_embeds_{layer_idx}” where layer_idx is 0, 1, or 2 Shape: [batch_size, sequence_length, hidden_size] (FLOAT16)
Vocabulary Mapping Configuration
-
char const *kReducedVocabSizeKey = "reduced_vocab_size"#
JSON configuration key for reduced vocabulary size.
Used to check if the model uses vocabulary reduction optimization
-
char const *kVocabMapFileName = "vocab_map.safetensors"#
Vocabulary mapping file name.
SafeTensors file containing mapping between full and reduced vocabulary
Audio Encoder Bindings (Qwen3-Omni)
-
char const *kAudioPaddedFeatures = "padded_feature"#
Audio padded features tensor - chunked and padded Mel-spectrogram.
Shape: [num_chunks, mel_bins, max_chunk_len] (FLOAT16)
-
char const *kAudioPaddedMaskIndices = "padded_mask_after_cnn_indices"#
Audio padded mask indices - nonzero indices from mask.
Shape: [num_valid_elements, 2] (INT64) Each row is [chunk_idx, position_idx] indicating valid positions after CNN downsampling
-
char const *kAudioAttentionMask = "attention_mask"#
Audio attention mask - block-diagonal mask for chunk-wise attention.
Shape: [num_attention_elems, num_attention_elems] (FLOAT16) Block-diagonal matrix where each block corresponds to one audio chunk
-
char const *kAudioOutput = "last_hidden_state"#
Audio encoder output - audio embeddings.
Shape: [num_audio_tokens, hidden_size] (FLOAT16)
CodePredictor Bindings (Qwen3-Omni)
-
char const *kLmHeadWeight = "lm_head_weight"#
LM head weight tensor - dynamically bound weight for CodePredictor.
Shape: [vocab_size, hidden_size] (FLOAT16) This is used for dynamic lm_head selection in CodePredictor (15 different heads for RVQ layers)
Code2Wav Vocoder Bindings (Qwen3-Omni)
-
char const *kCode2WavCodes = "codes"#
Code2Wav input codes tensor - RVQ codec codes for vocoder.
Shape: [batch_size, num_quantizers, sequence_length] (INT32) num_quantizers: 15 for Qwen3-Omni
-
char const *kCode2WavWaveform = "waveform"#
Code2Wav output waveform tensor - generated audio waveform.
Shape: [batch_size, 1, waveform_length] (FLOAT32) Values in range [-1.0, 1.0]
LoRA (Low-Rank Adaptation) Bindings
-
char const *kLoraAPrefix = "lora_A"#
LoRA A weight matrix prefix - use with layer/component specific suffixes.
Template: “lora_A_{component}_{layer}” Shape: [gemm_k, lora_rank] (FLOAT16)
-
char const *kLoraBPrefix = "lora_B"#
LoRA B weight matrix prefix - use with layer/component specific suffixes.
Template: “lora_B_{component}_{layer}” Shape: [lora_rank, gemm_n] (FLOAT16)
-
char const *kEdgellmVersion = "edgellm_version"#
EDGELLM version.
Value: “major.minor.patch.build” Example: “0.5.0.0”
Utility Functions
- inline std::string formatKVCacheName(
- int32_t layerIdx,
- bool isPast = true
Format KV cache binding name for a specific layer.
- Parameters:
layerIdx – The decoder layer index
isPast – Whether this is past (true) or present (false) key-values
- Returns:
Formatted binding name like “past_key_values_0” or “present_key_values_0”
- inline std::string formatKCacheName(
- int32_t layerIdx,
- bool isPast = true
Format K cache binding name for a specific layer (TensorRT native operations)
- Parameters:
layerIdx – The decoder layer index
isPast – Whether this is past (true) or present (false) K cache
- Returns:
Formatted binding name like “k_cache_0” or “present_k_cache_0”
- inline std::string formatVCacheName(
- int32_t layerIdx,
- bool isPast = true
Format V cache binding name for a specific layer (TensorRT native operations)
- Parameters:
layerIdx – The decoder layer index
isPast – Whether this is past (true) or present (false) V cache
- Returns:
Formatted binding name like “v_cache_0” or “present_v_cache_0”
- inline std::string formatSSMStateName(
- int32_t mambaLayerIdx,
- bool isPast = true
Format SSM state binding name for a specific Mamba layer.
- Parameters:
mambaLayerIdx – The Mamba layer index (0-based, only counting Mamba layers)
isPast – Whether this is past (true) or present (false) SSM state
- Returns:
Formatted binding name like “ssm_state_0” or “present_ssm_state_0”
-
inline bool isSSMStateBinding(std::string const &bindingName)#
Check if a binding name is an SSM state tensor.
- Parameters:
bindingName – The tensor binding name to check
- Returns:
True if the binding is an SSM state tensor
- inline std::string formatConvStateName(
- int32_t mambaLayerIdx,
- bool isPast = true
Format conv state binding name for a specific Mamba layer.
- Parameters:
mambaLayerIdx – The Mamba layer index (0-based, only counting Mamba layers)
isPast – Whether this is past (true) or present (false) conv state
- Returns:
Formatted binding name like “conv_state_0” or “present_conv_state_0”
-
inline bool isConvStateBinding(std::string const &bindingName)#
Check if a binding name is a conv state tensor.
- Parameters:
bindingName – The tensor binding name to check
- Returns:
True if the binding is a conv state tensor
-
inline bool isLoraBinding(std::string const &bindingName) noexcept#
Check if a binding name is a LoRA weight tensor.
- Parameters:
bindingName – The tensor binding name to check
- Returns:
True if the binding is a LoRA weight tensor
-
inline bool isKVCacheBinding(std::string const &bindingName) noexcept#
Check if a binding name is a KV cache tensor.
- Parameters:
bindingName – The tensor binding name to check
- Returns:
True if the binding is a KV cache tensor
-
inline std::string formatDeepstackFeaturesName(int32_t layerIdx)#
Format deepstack features binding name for a specific layer.
- Parameters:
layerIdx – The layer index
- Returns:
Formatted binding name like “deepstack_features_0”
-
inline std::string formatDeepstackEmbedsName(int32_t embedIdx)#
Format deepstack embeddings binding name for a specific index.
- Parameters:
embedIdx – The embedding index (0, 1, or 2 for Qwen3VL)
- Returns:
Formatted binding name like “deepstack_embeds_0”
-
-
namespace binding_names#
Unified tensor binding names for TensorRT engines.
This namespace provides a centralized location for all tensor binding names used across both the builder and runtime components to ensure consistency and avoid duplication.