Binding Names#

namespace trt_edgellm#
namespace binding_names#

Core LLM Input/Output Bindings

char const *kInputsEmbeds = "inputs_embeds"#

Input embeddings tensor - contains the embedded input sequence.

Shape: [batch_size, sequence_length, hidden_size] (FLOAT16)

char const *kContextLengths = "context_lengths"#

Context lengths tensor - specifies the actual length of each sequence in the batch.

Shape: [batch_size] (INT32)

char const *kLastTokenIds = "last_token_ids"#

Last token IDs tensor - indices of the last tokens to extract from hidden states.

Shape: [batch_size] for Eagle models, [batch_size, 1] for vanilla models (INT64)

char const *kLogits = "logits"#

Output logits tensor - probability distribution over vocabulary.

Shape: [batch_size, vocab_size] or [select_tokens, vocab_size] (FLOAT32)

char const *kOutputHiddenStates = "hidden_states"#

Output hidden states tensor - intermediate representations for speculative decoding.

Shape: [batch_size, sequence_length, hidden_dim] (FLOAT16)

Positional Encoding Bindings

char const *kRopeCosSin = "rope_rotary_cos_sin"#

Rotary positional encoding cos/sin cache tensor.

Shape: [batch_size, max_seq_len, rotary_dim] (FLOAT32)

KV Cache Bindings

char const *kKVCacheStartIndex = "kvcache_start_index"#

KV cache start index tensor - starting position for KV cache reuse.

Shape: [batch_size] (INT32)

char const *kPastKeyValuesTemplate = "past_key_values"#

Past key-value cache tensor template - use with layer index formatting.

Template: “past_key_values_{layer_idx}” Shape: [batch_size, 2, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kPresentKeyValuesTemplate = "present_key_values"#

Present key-value cache tensor template - use with layer index formatting.

Template: “present_key_values_{layer_idx}” Shape: [batch_size, 2, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kKCacheTemplate = "k_cache"#

K cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.

Template: “k_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kVCacheTemplate = "v_cache"#

V cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.

Template: “v_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kPresentKCacheTemplate = "present_k_cache"#

Present K cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.

Template: “present_k_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)

char const *kPresentVCacheTemplate = "present_v_cache"#

Present V cache tensor template for TensorRT native KVCacheUpdate operations - use with layer index formatting.

Template: “present_v_cache_{layer_idx}” Shape: [batch_size, num_kv_heads, seq_len, head_dim] (FLOAT16)

SSM (Mamba) State Bindings

char const *kSSMStateTemplate = "ssm_state"#

Past SSM state tensor template for Mamba layers.

Template: “ssm_state_{mamba_layer_idx}” Shape: [batch_size, mamba_num_heads, mamba_head_dim, ssm_state_size] (FLOAT16)

char const *kPresentSSMStateTemplate = "present_ssm_state"#

Present SSM state tensor template for Mamba layers.

Template: “present_ssm_state_{mamba_layer_idx}” Shape: [batch_size, mamba_num_heads, mamba_head_dim, ssm_state_size] (FLOAT16)

char const *kConvStateTemplate = "conv_state"#

Past conv state tensor template for Mamba layers.

Template: “conv_state_{mamba_layer_idx}” Shape: [batch_size, conv_dim, conv_kernel_size] (FLOAT16)

char const *kPresentConvStateTemplate = "present_conv_state"#

Present conv state tensor template for Mamba layers.

Template: “present_conv_state_{mamba_layer_idx}” Shape: [batch_size, conv_dim, conv_kernel_size] (FLOAT16)

Eagle Speculative Decoding Bindings

char const *kBaseModelHiddenStates = "hidden_states_input"#

Base model hidden states input for Eagle draft models.

Shape: [batch_size, sequence_length, base_hidden_dim] (FLOAT16)

char const *kDraftModelHiddenStates = "hidden_states_from_draft"#

Draft model hidden states input for Eagle draft models.

Shape: [batch_size, sequence_length, draft_hidden_dim] (FLOAT16)

char const *kAttentionMask = "attention_mask"#

Attention mask for Eagle models - packed tree attention mask.

Shape: [batch_size, tree_size, packed_mask_len] (INT32 for base, INT8 for draft)

char const *kAttentionPosId = "attention_pos_id"#

Attention position IDs for Eagle models.

Shape: [batch_size, tree_size] (INT32)

Visual Encoder Bindings (Qwen-VL, InternVL)

char const *kVisualInput = "input"#

Visual input tensor for vision transformers.

Shape: [sequence_length, input_dim] for Qwen-VL, [num_blocks, channels, height, width] for InternVL

char const *kVisualOutput = "output"#

Visual output tensor from vision transformers.

Shape: [num_image_tokens, hidden_size] (FLOAT16)

char const *kRotaryPosEmb = "rotary_pos_emb"#

Rotary positional embeddings for visual inputs (Qwen-VL specific)

Shape: [sequence_length, embed_dim] (FLOAT32)

char const *kCuSeqlens = "cu_seqlens"#

Cumulative sequence lengths for ragged ViT attention.

Shape: [num_images + 1] (INT32)

char const *kMaxSeqLenCarrier = "max_seqlen_carrier"#

Shape-only input used to convey runtime max sequence-length for FMHA launch.

Shape: [max_seqlen] (INT32)

char const *kCuWindowSeqlens = "cu_window_seqlens"#

Cumulative window sequence lengths for Qwen2.5-VL window attention.

Shape: [num_windows + 1] (INT32)

char const *kWindowIndex = "window_index"#

Window index for Qwen2.5-VL sliding window attention.

Shape: [num_windows] (INT64)

char const *kReverseWindowIndex = "reverse_window_index"#

Reverse window index for Qwen2.5-VL sliding window attention.

Shape: [num_windows] (INT64)

char const *kFastPosEmbIdx = "fast_pos_embed_idx"#

Fast position embeddings index tensor for Qwen3-VL vision model.

Shape: [4, sequence_length] (INT64)

char const *kFastPosEmbWeight = "fast_pos_embed_weight"#

Fast position embeddings weight tensor for Qwen3-VL vision model.

Shape: [4, sequence_length] (FLOAT16)

char const *kDeepstackFeaturesTemplate = "deepstack_features"#

Deepstack features tensor for Qwen3-VL vision model (visual encoder output)

Shape: [num_image_tokens, hidden_size] (FLOAT16)

char const *kDeepstackEmbedsTemplate = "deepstack_embeds"#

Deepstack embeddings tensor template for Qwen3-VL text model (LLM input)

Template: “deepstack_embeds_{layer_idx}” where layer_idx is 0, 1, or 2 Shape: [batch_size, sequence_length, hidden_size] (FLOAT16)

Vocabulary Mapping Configuration

char const *kReducedVocabSizeKey = "reduced_vocab_size"#

JSON configuration key for reduced vocabulary size.

Used to check if the model uses vocabulary reduction optimization

char const *kVocabMapFileName = "vocab_map.safetensors"#

Vocabulary mapping file name.

SafeTensors file containing mapping between full and reduced vocabulary

Audio Encoder Bindings (Qwen3-Omni)

char const *kAudioPaddedFeatures = "padded_feature"#

Audio padded features tensor - chunked and padded Mel-spectrogram.

Shape: [num_chunks, mel_bins, max_chunk_len] (FLOAT16)

char const *kAudioPaddedMaskIndices = "padded_mask_after_cnn_indices"#

Audio padded mask indices - nonzero indices from mask.

Shape: [num_valid_elements, 2] (INT64) Each row is [chunk_idx, position_idx] indicating valid positions after CNN downsampling

char const *kAudioAttentionMask = "attention_mask"#

Audio attention mask - block-diagonal mask for chunk-wise attention.

Shape: [num_attention_elems, num_attention_elems] (FLOAT16) Block-diagonal matrix where each block corresponds to one audio chunk

char const *kAudioOutput = "last_hidden_state"#

Audio encoder output - audio embeddings.

Shape: [num_audio_tokens, hidden_size] (FLOAT16)

CodePredictor Bindings (Qwen3-Omni)

char const *kLmHeadWeight = "lm_head_weight"#

LM head weight tensor - dynamically bound weight for CodePredictor.

Shape: [vocab_size, hidden_size] (FLOAT16) This is used for dynamic lm_head selection in CodePredictor (15 different heads for RVQ layers)

Code2Wav Vocoder Bindings (Qwen3-Omni)

char const *kCode2WavCodes = "codes"#

Code2Wav input codes tensor - RVQ codec codes for vocoder.

Shape: [batch_size, num_quantizers, sequence_length] (INT32) num_quantizers: 15 for Qwen3-Omni

char const *kCode2WavWaveform = "waveform"#

Code2Wav output waveform tensor - generated audio waveform.

Shape: [batch_size, 1, waveform_length] (FLOAT32) Values in range [-1.0, 1.0]

LoRA (Low-Rank Adaptation) Bindings

char const *kLoraAPrefix = "lora_A"#

LoRA A weight matrix prefix - use with layer/component specific suffixes.

Template: “lora_A_{component}_{layer}” Shape: [gemm_k, lora_rank] (FLOAT16)

char const *kLoraBPrefix = "lora_B"#

LoRA B weight matrix prefix - use with layer/component specific suffixes.

Template: “lora_B_{component}_{layer}” Shape: [lora_rank, gemm_n] (FLOAT16)

char const *kEdgellmVersion = "edgellm_version"#

EDGELLM version.

Value: “major.minor.patch.build” Example: “0.5.0.0”

Utility Functions

inline std::string formatKVCacheName(
int32_t layerIdx,
bool isPast = true
)#

Format KV cache binding name for a specific layer.

Parameters:
  • layerIdx – The decoder layer index

  • isPast – Whether this is past (true) or present (false) key-values

Returns:

Formatted binding name like “past_key_values_0” or “present_key_values_0”

inline std::string formatKCacheName(
int32_t layerIdx,
bool isPast = true
)#

Format K cache binding name for a specific layer (TensorRT native operations)

Parameters:
  • layerIdx – The decoder layer index

  • isPast – Whether this is past (true) or present (false) K cache

Returns:

Formatted binding name like “k_cache_0” or “present_k_cache_0”

inline std::string formatVCacheName(
int32_t layerIdx,
bool isPast = true
)#

Format V cache binding name for a specific layer (TensorRT native operations)

Parameters:
  • layerIdx – The decoder layer index

  • isPast – Whether this is past (true) or present (false) V cache

Returns:

Formatted binding name like “v_cache_0” or “present_v_cache_0”

inline std::string formatSSMStateName(
int32_t mambaLayerIdx,
bool isPast = true
)#

Format SSM state binding name for a specific Mamba layer.

Parameters:
  • mambaLayerIdx – The Mamba layer index (0-based, only counting Mamba layers)

  • isPast – Whether this is past (true) or present (false) SSM state

Returns:

Formatted binding name like “ssm_state_0” or “present_ssm_state_0”

inline bool isSSMStateBinding(std::string const &bindingName)#

Check if a binding name is an SSM state tensor.

Parameters:

bindingName – The tensor binding name to check

Returns:

True if the binding is an SSM state tensor

inline std::string formatConvStateName(
int32_t mambaLayerIdx,
bool isPast = true
)#

Format conv state binding name for a specific Mamba layer.

Parameters:
  • mambaLayerIdx – The Mamba layer index (0-based, only counting Mamba layers)

  • isPast – Whether this is past (true) or present (false) conv state

Returns:

Formatted binding name like “conv_state_0” or “present_conv_state_0”

inline bool isConvStateBinding(std::string const &bindingName)#

Check if a binding name is a conv state tensor.

Parameters:

bindingName – The tensor binding name to check

Returns:

True if the binding is a conv state tensor

inline bool isLoraBinding(std::string const &bindingName) noexcept#

Check if a binding name is a LoRA weight tensor.

Parameters:

bindingName – The tensor binding name to check

Returns:

True if the binding is a LoRA weight tensor

inline bool isKVCacheBinding(std::string const &bindingName) noexcept#

Check if a binding name is a KV cache tensor.

Parameters:

bindingName – The tensor binding name to check

Returns:

True if the binding is a KV cache tensor

inline std::string formatDeepstackFeaturesName(int32_t layerIdx)#

Format deepstack features binding name for a specific layer.

Parameters:

layerIdx – The layer index

Returns:

Formatted binding name like “deepstack_features_0”

inline std::string formatDeepstackEmbedsName(int32_t embedIdx)#

Format deepstack embeddings binding name for a specific index.

Parameters:

embedIdx – The embedding index (0, 1, or 2 for Qwen3VL)

Returns:

Formatted binding name like “deepstack_embeds_0”

namespace binding_names#

Unified tensor binding names for TensorRT engines.

This namespace provides a centralized location for all tensor binding names used across both the builder and runtime components to ensure consistency and avoid duplication.