Binding Names#

namespace trt_edgellm#
namespace binding_names#

Core LLM Input/Output Bindings

char const *kInputsEmbeds = "inputs_embeds"#

Input embeddings tensor - contains the embedded input sequence.

Shape: [batch_size, sequence_length, hidden_size] (FLOAT16)

char const *kContextLengths = "context_lengths"#

Context lengths tensor - specifies the actual length of each sequence in the batch.

Shape: [batch_size] (INT32)

char const *kLastTokenIds = "last_token_ids"#

Last token IDs tensor - indices of the last tokens to extract from hidden states.

Shape: [batch_size] for Eagle models, [batch_size, 1] for vanilla models (INT64)

char const *kLogits = "logits"#

Output logits tensor - probability distribution over vocabulary.

Shape: [batch_size, vocab_size] or [select_tokens, vocab_size] (FLOAT32)

char const *kOutputHiddenStates = "hidden_states"#

Output hidden states tensor - intermediate representations for speculative decoding.

Shape: [batch_size, sequence_length, hidden_dim] (FLOAT16)

Positional Encoding Bindings

char const *kRopeCosSin = "rope_rotary_cos_sin"#

Rotary positional encoding cos/sin cache tensor.

Shape: [batch_size, max_seq_len, rotary_dim] (FLOAT32)

KV Cache Bindings

char const *kKVCacheStartIndex = "kvcache_start_index"#

KV cache start index tensor - starting position for KV cache reuse.

Shape: [batch_size] (INT32)

char const *kPastKeyValuesTemplate = "past_key_values"#

Past key-value cache tensor template - use with layer index formatting.

Template: “past_key_values_{layer_idx}” Shape: [batch_size, 2, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kPresentKeyValuesTemplate = "present_key_values"#

Present key-value cache tensor template - use with layer index formatting.

Template: “present_key_values_{layer_idx}” Shape: [batch_size, 2, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kKCacheTemplate = "k_cache"#

K cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.

Template: “k_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kVCacheTemplate = "v_cache"#

V cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.

Template: “v_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kPresentKCacheTemplate = "present_k_cache"#

Present K cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.

Template: “present_k_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kPresentVCacheTemplate = "present_v_cache"#

Present V cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.

Template: “present_v_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)

SSM (Mamba) State Bindings

char const *kSSMStateTemplate = "ssm_state"#

Past SSM state tensor template for Mamba layers.

Template: “ssm_state_{mamba_layer_idx}” Shape: [batch_size, mamba_num_heads, mamba_head_dim, ssm_state_size] (FLOAT16)

char const *kPresentSSMStateTemplate = "present_ssm_state"#

Present SSM state tensor template for Mamba layers.

Template: “present_ssm_state_{mamba_layer_idx}” Shape: [batch_size, mamba_num_heads, mamba_head_dim, ssm_state_size] (FLOAT16)

char const *kConvStateTemplate = "conv_state"#

Past conv state tensor template for Mamba layers.

Template: “conv_state_{mamba_layer_idx}” Shape: [batch_size, conv_dim, conv_kernel_size] (FLOAT16)

char const *kPresentConvStateTemplate = "present_conv_state"#

Present conv state tensor template for Mamba layers.

Template: “present_conv_state_{mamba_layer_idx}” Shape: [batch_size, conv_dim, conv_kernel_size] (FLOAT16)

Eagle Speculative Decoding Bindings

char const *kBaseModelHiddenStates = "hidden_states_input"#

Base model hidden states input for Eagle draft models.

Shape: [batch_size, sequence_length, base_hidden_dim] (FLOAT16)

char const *kDraftModelHiddenStates = "hidden_states_from_draft"#

Draft model hidden states input for Eagle draft models.

Shape: [batch_size, sequence_length, draft_hidden_dim] (FLOAT16)

char const *kAttentionMask = "attention_mask"#

Attention mask for Eagle models - packed tree attention mask.

Shape: [batch_size, tree_size, packed_mask_len] (INT32 for base, INT8 for draft)

char const *kAttentionPosId = "attention_pos_id"#

Attention position IDs for Eagle models.

Shape: [batch_size, tree_size] (INT32)

Visual Encoder Bindings (Qwen-VL, InternVL)

char const *kVisualInput = "input"#

Visual input tensor for vision transformers.

Shape: [sequence_length, input_dim] for Qwen-VL, [num_blocks, channels, height, width] for InternVL

char const *kVisualOutput = "output"#

Visual output tensor from vision transformers.

Shape: [num_image_tokens, hidden_size] (FLOAT16)

char const *kRotaryPosEmb = "rotary_pos_emb"#

Rotary positional embeddings for visual inputs (Qwen-VL specific)

Shape: [sequence_length, embed_dim] (FLOAT32)

char const *kCuSeqlens = "cu_seqlens"#

Cumulative sequence lengths for ragged ViT attention.

Shape: [num_images + 1] (INT32)

char const *kMaxSeqLenCarrier = "max_seqlen_carrier"#

Shape-only input used to convey runtime max sequence-length for FMHA launch.

Shape: [max_seqlen] (INT32)

char const *kCuWindowSeqlens = "cu_window_seqlens"#

Cumulative window sequence lengths for Qwen2.5-VL window attention.

Shape: [num_windows + 1] (INT32)

char const *kWindowIndex = "window_index"#

Window index for Qwen2.5-VL sliding window attention.

Shape: [num_windows] (INT64)

char const *kReverseWindowIndex = "reverse_window_index"#

Reverse window index for Qwen2.5-VL sliding window attention.

Shape: [num_windows] (INT64)

char const *kFastPosEmbIdx = "fast_pos_embed_idx"#

Fast position embeddings index tensor for Qwen3-VL vision model.

Shape: [4, sequence_length] (INT64)

char const *kFastPosEmbWeight = "fast_pos_embed_weight"#

Fast position embeddings weight tensor for Qwen3-VL vision model.

Shape: [4, sequence_length] (FLOAT16)

char const *kDeepstackFeaturesTemplate = "deepstack_features"#

Deepstack features tensor for Qwen3-VL vision model (visual encoder output)

Shape: [num_image_tokens, hidden_size] (FLOAT16)

char const *kDeepstackEmbedsTemplate = "deepstack_embeds"#

Deepstack embeddings tensor template for Qwen3-VL text model (LLM input)

Template: “deepstack_embeds_{layer_idx}” where layer_idx is 0, 1, or 2 Shape: [batch_size, sequence_length, hidden_size] (FLOAT16)

Vocabulary Mapping Configuration

char const *kReducedVocabSizeKey = "reduced_vocab_size"#

JSON configuration key for reduced vocabulary size.

Used to check if the model uses vocabulary reduction optimization

char const *kVocabMapFileName = "vocab_map.safetensors"#

Vocabulary mapping file name.

SafeTensors file containing mapping between full and reduced vocabulary

Audio Encoder Bindings (Qwen3-Omni)

char const *kAudioPaddedFeatures = "padded_feature"#

Audio padded features tensor - chunked and padded Mel-spectrogram.

Shape: [num_chunks, mel_bins, max_chunk_len] (FLOAT16)

char const *kAudioPaddedMaskIndices = "padded_mask_after_cnn_indices"#

Audio padded mask indices - nonzero indices from mask.

Shape: [num_valid_elements, 2] (INT64) Each row is [chunk_idx, position_idx] indicating valid positions after CNN downsampling

char const *kAudioAttentionMask = "attention_mask"#

Audio attention mask - block-diagonal mask for chunk-wise attention.

Shape: [num_attention_elems, num_attention_elems] (FLOAT16) Block-diagonal matrix where each block corresponds to one audio chunk

char const *kAudioOutput = "last_hidden_state"#

Audio encoder output - audio embeddings.

Shape: [num_audio_tokens, hidden_size] (FLOAT16)

CodePredictor Bindings (Qwen3-Omni)

char const *kLmHeadWeight = "lm_head_weight"#

LM head weight tensor - dynamically bound weight for CodePredictor.

Shape: [vocab_size, hidden_size] (FLOAT16) This is used for dynamic lm_head selection in CodePredictor (15 different heads for RVQ layers)

Code2Wav Vocoder Bindings (Qwen3-Omni)

char const *kCode2WavCodes = "codes"#

Code2Wav input codes tensor - RVQ codec codes for vocoder.

Shape: [batch_size, num_quantizers, sequence_length] (INT32) num_quantizers: 15 for Qwen3-Omni

char const *kCode2WavWaveform = "waveform"#

Code2Wav output waveform tensor - generated audio waveform.

Shape: [batch_size, 1, waveform_length] (FLOAT32) Values in range [-1.0, 1.0]

LoRA (Low-Rank Adaptation) Bindings

char const *kLoraAPrefix = "lora_A"#

LoRA A weight matrix prefix - use with layer/component specific suffixes.

Template: “lora_A_{component}_{layer}” Shape: [gemm_k, lora_rank] (FLOAT16)

char const *kLoraBPrefix = "lora_B"#

LoRA B weight matrix prefix - use with layer/component specific suffixes.

Template: “lora_B_{component}_{layer}” Shape: [lora_rank, gemm_n] (FLOAT16)

char const *kEdgellmVersion = "edgellm_version"#

EDGELLM version.

Value: “major.minor.patch.build” Example: “0.5.0.0”

Utility Functions

inline std::string formatKVCacheName(
int32_t layerIdx,
bool isPast = true
)#

Format KV cache binding name for a specific layer.

Parameters:
  • layerIdx – The decoder layer index

  • isPast – Whether this is past (true) or present (false) key-values

Returns:

Formatted binding name like “past_key_values_0” or “present_key_values_0”

inline std::string formatKCacheName(
int32_t layerIdx,
bool isPast = true
)#

Format K cache binding name for a specific layer (TensorRT native operations)

Parameters:
  • layerIdx – The decoder layer index

  • isPast – Whether this is past (true) or present (false) K cache

Returns:

Formatted binding name like “k_cache_0” or “present_k_cache_0”

inline std::string formatVCacheName(
int32_t layerIdx,
bool isPast = true
)#

Format V cache binding name for a specific layer (TensorRT native operations)

Parameters:
  • layerIdx – The decoder layer index

  • isPast – Whether this is past (true) or present (false) V cache

Returns:

Formatted binding name like “v_cache_0” or “present_v_cache_0”

inline std::string formatSSMStateName(
int32_t mambaLayerIdx,
bool isPast = true
)#

Format SSM state binding name for a specific Mamba layer.

Parameters:
  • mambaLayerIdx – The Mamba layer index (0-based, only counting Mamba layers)

  • isPast – Whether this is past (true) or present (false) SSM state

Returns:

Formatted binding name like “ssm_state_0” or “present_ssm_state_0”

inline bool isSSMStateBinding(std::string const &bindingName)#

Check if a binding name is an SSM state tensor.

Parameters:

bindingName – The tensor binding name to check

Returns:

True if the binding is an SSM state tensor

inline std::string formatConvStateName(
int32_t mambaLayerIdx,
bool isPast = true
)#

Format conv state binding name for a specific Mamba layer.

Parameters:
  • mambaLayerIdx – The Mamba layer index (0-based, only counting Mamba layers)

  • isPast – Whether this is past (true) or present (false) conv state

Returns:

Formatted binding name like “conv_state_0” or “present_conv_state_0”

inline bool isConvStateBinding(std::string const &bindingName)#

Check if a binding name is a conv state tensor.

Parameters:

bindingName – The tensor binding name to check

Returns:

True if the binding is a conv state tensor

inline bool isLoraBinding(std::string const &bindingName) noexcept#

Check if a binding name is a LoRA weight tensor.

Parameters:

bindingName – The tensor binding name to check

Returns:

True if the binding is a LoRA weight tensor

inline bool isKVCacheBinding(std::string const &bindingName) noexcept#

Check if a binding name is a KV cache tensor.

Parameters:

bindingName – The tensor binding name to check

Returns:

True if the binding is a KV cache tensor

inline std::string formatDeepstackFeaturesName(int32_t layerIdx)#

Format deepstack features binding name for a specific layer.

Parameters:

layerIdx – The layer index

Returns:

Formatted binding name like “deepstack_features_0”

inline std::string formatDeepstackEmbedsName(int32_t embedIdx)#

Format deepstack embeddings binding name for a specific index.

Parameters:

embedIdx – The embedding index (0, 1, or 2 for Qwen3VL)

Returns:

Formatted binding name like “deepstack_embeds_0”

namespace binding_names#

Unified tensor binding names for TensorRT engines.

This namespace provides a centralized location for all tensor binding names used across both the builder and runtime components to ensure consistency and avoid duplication.