Binding Names#
-
namespace trt_edgellm#
-
namespace binding_names#
Core LLM Input/Output Bindings
-
char const *kInputIds = "input_ids"#
Input token IDs tensor - contains the tokenized input sequence.
Shape: [batch_size, sequence_length] (INT32)
-
char const *kContextLengths = "context_lengths"#
Context lengths tensor - specifies the actual length of each sequence in the batch.
Shape: [batch_size] (INT32)
-
char const *kLastTokenIds = "last_token_ids"#
Last token IDs tensor - indices of the last tokens to extract from hidden states.
Shape: [batch_size] for Eagle models, [batch_size, 1] for vanilla models (INT64)
-
char const *kLogits = "logits"#
Output logits tensor - unnormalized prediction scores over the vocabulary.
Shape: [batch_size, vocab_size] or [select_tokens, vocab_size] (FLOAT32)
-
char const *kOutputHiddenStates = "hidden_states"#
Output hidden states tensor - intermediate representations for speculative decoding.
Shape: [batch_size, sequence_length, hidden_dim] (FLOAT16)
Positional Encoding Bindings
-
char const *kRopeCosSin = "rope_rotary_cos_sin"#
Rotary positional encoding cos/sin cache tensor.
Shape: [batch_size, max_seq_len, rotary_dim] (FLOAT32)
KV Cache Bindings
-
char const *kKVCacheStartIndex = "kvcache_start_index"#
KV cache start index tensor - starting position for KV cache reuse.
Shape: [batch_size] (INT32)
-
char const *kPastKeyValuesTemplate = "past_key_values"#
Past key-value cache tensor template - use with layer index formatting.
Template: “past_key_values.{layer_idx}”
Shape: [batch_size, 2, num_kv_heads, seq_len, head_dim] (FLOAT16)
-
char const *kPresentKeyValuesTemplate = "present_key_values"#
Present key-value cache tensor template - use with layer index formatting.
Template: “present_key_values.{layer_idx}”
Shape: [batch_size, 2, num_kv_heads, seq_len, head_dim] (FLOAT16)
Eagle Speculative Decoding Bindings
-
char const *kBaseModelHiddenStates = "hidden_states_input"#
Base model hidden states input for Eagle draft models.
Shape: [batch_size, sequence_length, base_hidden_dim] (FLOAT16)
-
char const *kDraftModelHiddenStates = "hidden_states_from_draft"#
Draft model hidden states input for Eagle draft models.
Shape: [batch_size, sequence_length, draft_hidden_dim] (FLOAT16)
-
char const *kAttentionMask = "attention_mask"#
Attention mask for Eagle models - packed tree attention mask.
Shape: [batch_size, tree_size, packed_mask_len] (INT32 for base, INT8 for draft)
-
char const *kAttentionPosId = "attention_pos_id"#
Attention position IDs for Eagle models.
Shape: [batch_size, tree_size] (INT32)
Vision-Language Model (VLM) Bindings
-
char const *kImageEmbeds = "image_embeds"#
Multimodal image embeddings tensor.
Shape: [num_image_tokens, hidden_size] (FLOAT16)
Visual Encoder Bindings (Qwen-VL, InternVL)
-
char const *kVisualInput = "input"#
Visual input tensor for vision transformers.
Shape: [sequence_length, input_dim] for Qwen-VL, [num_blocks, channels, height, width] for InternVL
-
char const *kVisualOutput = "output"#
Visual output tensor from vision transformers.
Shape: [num_image_tokens, hidden_size] (FLOAT16)
-
char const *kRotaryPosEmb = "rotary_pos_emb"#
Rotary positional embeddings for visual inputs (Qwen-VL specific).
Shape: [sequence_length, embed_dim] (FLOAT32)
-
char const *kWindowAttentionMask = "window_attention_mask"#
Window attention mask for Qwen2.5-VL models.
Shape: [1, sequence_length, sequence_length] (FLOAT16)
-
char const *kWindowIndex = "window_index"#
Window index for Qwen2.5-VL sliding window attention.
Shape: [num_windows] (INT64)
-
char const *kReverseWindowIndex = "reverse_window_index"#
Reverse window index for Qwen2.5-VL sliding window attention.
Shape: [num_windows] (INT64)
-
char const *kFastPosEmbIdx = "fast_pos_embed_idx"#
Fast position embeddings index tensor for Qwen3-VL vision model.
Shape: [4, sequence_length] (INT64)
-
char const *kFastPosEmbWeight = "fast_pos_embed_weight"#
Fast position embeddings weight tensor for Qwen3-VL vision model.
Shape: [4, sequence_length] (FLOAT16)
-
char const *kDeepstackFeaturesTemplate = "deepstack_features"#
Deepstack features tensor template for Qwen3-VL vision model - use with layer index formatting.
Template: “deepstack_features.{layer_idx}”
Shape: [num_image_tokens, hidden_size] (FLOAT16)
Vocabulary Mapping Configuration
-
char const *kReducedVocabSizeKey = "reduced_vocab_size"#
JSON configuration key for reduced vocabulary size.
Used to check whether the model uses the vocabulary reduction optimization.
-
char const *kVocabMapFileName = "vocab_map.safetensors"#
Vocabulary mapping file name.
SafeTensors file containing the mapping between the full and reduced vocabularies.
LoRA (Low-Rank Adaptation) Bindings
-
char const *kLoraAPrefix = "lora_A"#
LoRA A weight matrix prefix - use with layer/component specific suffixes.
Template: “lora_A_{component}_{layer}”
Shape: [gemm_k, lora_rank] (FLOAT16)
-
char const *kLoraBPrefix = "lora_B"#
LoRA B weight matrix prefix - use with layer/component specific suffixes.
Template: “lora_B_{component}_{layer}”
Shape: [lora_rank, gemm_n] (FLOAT16)
-
char const *kEdgellmVersion = "edgellm_version"#
EDGELLM version.
Value: “major.minor.patch.build”
Example: “0.4.0.0”
Utility Functions
-
inline std::string formatKVCacheName(int32_t layerIdx, bool isPast = true)#
Format KV cache binding name for a specific layer.
- Parameters:
layerIdx – The decoder layer index
isPast – Whether this is past (true) or present (false) key-values
- Returns:
Formatted binding name like “past_key_values.0” or “present_key_values.0”
-
inline bool isLoraBinding(std::string const &bindingName)#
Check if a binding name is a LoRA weight tensor.
- Parameters:
bindingName – The tensor binding name to check
- Returns:
True if the binding is a LoRA weight tensor
-
inline bool isKVCacheBinding(std::string const &bindingName)#
Check if a binding name is a KV cache tensor.
- Parameters:
bindingName – The tensor binding name to check
- Returns:
True if the binding is a KV cache tensor
-
inline std::string formatDeepstackFeaturesName(int32_t layerIdx)#
Format deepstack features binding name for a specific layer.
- Parameters:
layerIdx – The layer index
- Returns:
Formatted binding name like “deepstack_features.0”
-
char const *kInputIds = "input_ids"#
-
namespace binding_names#
-
namespace binding_names#
Unified tensor binding names for TensorRT engines.
This namespace provides a centralized location for all tensor binding names used across both the builder and runtime components to ensure consistency and avoid duplication.