LLM Inference Spec Decode Runtime#
-
class LLMInferenceSpecDecodeRuntime#
LLM inference runtime with Eagle speculative decoding.
Manages the inference pipeline using Eagle speculative decoding for improved throughput. Coordinates the base model, draft model, and multimodal processing.
Public Functions
-
LLMInferenceSpecDecodeRuntime(
std::string const &engineDir,
std::string const &multimodalEngineDir,
EagleDraftingConfig const &draftingConfig,
cudaStream_t stream
)#
Construct speculative decode runtime.
- Parameters:
engineDir – Directory containing engine files
multimodalEngineDir – Directory containing multimodal engine files
draftingConfig – Eagle drafting configuration
stream – CUDA stream for operations
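Example (illustrative sketch): constructing the runtime. The include path, the engine directory paths, and the use of a default-constructed EagleDraftingConfig are assumptions; consult the shipped headers for the actual include and configuration fields.

    // Sketch only: header path and engine directories are placeholders.
    #include "llm_inference_spec_decode_runtime.h"  // hypothetical include

    #include <cuda_runtime.h>
    #include <string>

    int main()
    {
        cudaStream_t stream{};
        cudaStreamCreate(&stream);

        // Fields of EagleDraftingConfig are documented with that struct;
        // here we rely on its defaults (an assumption).
        EagleDraftingConfig draftingConfig{};

        std::string const engineDir = "/path/to/llm_engine";
        std::string const multimodalEngineDir = "/path/to/multimodal_engine";

        LLMInferenceSpecDecodeRuntime runtime(
            engineDir, multimodalEngineDir, draftingConfig, stream);

        cudaStreamDestroy(stream);
        return 0;
    }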
-
~LLMInferenceSpecDecodeRuntime() = default#
Destructor.
-
bool captureDraftProposalCudaGraph(cudaStream_t stream)#
Capture CUDA graph for draft proposal.
- Parameters:
stream – CUDA stream
- Returns:
True on success, false on failure
-
bool captureDraftAcceptDecodeTokenCudaGraph(cudaStream_t stream)#
Capture CUDA graph for draft accept decode token.
- Parameters:
stream – CUDA stream
- Returns:
True on success, false on failure
-
bool captureBaseVerificationCudaGraph(cudaStream_t stream)#
Capture CUDA graph for base verification.
- Parameters:
stream – CUDA stream
- Returns:
True on success, false on failure
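A hedged sketch of chaining the three capture calls before serving requests; whether all three are required, and their relative order, is an assumption.

    // Capture the speculative-decoding CUDA graphs up front.
    // The ordering of the three captures is an assumption, not documented here.
    bool captureAllGraphs(LLMInferenceSpecDecodeRuntime &runtime, cudaStream_t stream)
    {
        if (!runtime.captureDraftProposalCudaGraph(stream))
        {
            return false; // draft proposal graph capture failed
        }
        if (!runtime.captureDraftAcceptDecodeTokenCudaGraph(stream))
        {
            return false; // draft accept/decode-token graph capture failed
        }
        if (!runtime.captureBaseVerificationCudaGraph(stream))
        {
            return false; // base verification graph capture failed
        }
        return true;
    }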
-
bool handleRequest(
LLMGenerationRequest const &request,
LLMGenerationResponse &response,
cudaStream_t stream
)#
Handle generation request.
- Parameters:
request – Generation request with prompts and parameters
response – Output response with generated tokens and text
stream – CUDA stream
- Returns:
True on success, false on failure
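Example (illustrative sketch): issuing a generation request. The members of LLMGenerationRequest and LLMGenerationResponse are not documented in this section, so the request is shown default-constructed with placeholder comments.

    // Sketch only: populate the request per the LLMGenerationRequest declaration.
    bool generate(LLMInferenceSpecDecodeRuntime &runtime, cudaStream_t stream)
    {
        LLMGenerationRequest request{};   // set prompts and sampling parameters here
        LLMGenerationResponse response{}; // receives generated tokens and text

        if (!runtime.handleRequest(request, response, stream))
        {
            return false; // generation failed
        }
        // Read generated tokens/text from response here.
        return true;
    }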
-
inline metrics::LLMPrefillMetrics const &getPrefillMetrics() const#
Get LLM prefill stage metrics.
-
inline metrics::EagleGenerationMetrics const &getEagleGenerationMetrics() const#
Get Eagle generation stage metrics.
-
inline metrics::MultimodalMetrics getMultimodalMetrics() const#
Get multimodal metrics (returns empty metrics if no multimodal runner)
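After a request completes, the per-stage metrics can be read back; a minimal sketch (the individual metric fields are defined by the metrics:: types and are not listed here).

    // Sketch only: retrieve per-stage metrics after generation.
    void collectMetrics(LLMInferenceSpecDecodeRuntime &runtime)
    {
        metrics::LLMPrefillMetrics const &prefill = runtime.getPrefillMetrics();
        metrics::EagleGenerationMetrics const &eagle = runtime.getEagleGenerationMetrics();
        metrics::MultimodalMetrics multimodal = runtime.getMultimodalMetrics();
        // Log or aggregate the fields defined by these metrics types.
        (void)prefill;
        (void)eagle;
        (void)multimodal;
    }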
-
struct SpecDecodeInferenceContext#
Execution context for speculative decode runtime.
Holds execution information and intermediate metadata during inference. Supports multi-batch inference with independent sequence tracking.
Public Functions
-
void initialize(
int32_t batchSize,
int32_t maxGenLength,
rt::OptionalInputTensor const &multimodal,
rt::OptionalInputTensors const &extraInputTensors,
cudaStream_t cudaStream
)#
Initialize the context with given parameters.
- Parameters:
batchSize – Active batch size
maxGenLength – Maximum generation length
multimodal – Optional multimodal embeddings
extraInputTensors – Extra input tensors (e.g., deepstack features)
cudaStream – CUDA stream for operations
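Example (illustrative sketch): preparing a context for a two-sequence batch. Default-constructing the optional tensor wrappers as "empty", and the enclosing scope (if any) of SpecDecodeInferenceContext, are assumptions.

    // Sketch only: initialize an execution context for a 2-sequence batch.
    void prepareContext(cudaStream_t stream)
    {
        SpecDecodeInferenceContext context{};

        int32_t const batchSize = 2;      // sequences in this batch
        int32_t const maxGenLength = 256; // cap on generated tokens per sequence

        rt::OptionalInputTensor multimodal{};         // no multimodal embeddings (assumed empty state)
        rt::OptionalInputTensors extraInputTensors{}; // no extra inputs such as deepstack features

        context.initialize(batchSize, maxGenLength, multimodal, extraInputTensors, stream);
    }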
Public Members
-
std::vector<std::string> systemPrompts#
System prompts for each sequence in batch.
-
std::vector<std::vector<int32_t>> rawBatchedInputIds#
Original token IDs before preprocessing, i.e., before padding is applied and reused system IDs are removed.
-
std::vector<std::vector<int32_t>> tokenIds#
Token IDs for each sequence: [batch_size][seq_length].
-
std::vector<int32_t> currentGenerateLengths#
Current generation length for each sequence: [batch_size].
-
std::vector<int32_t> promptLengths#
Prompt length (after reuse) for each sequence: [batch_size].
-
std::vector<int8_t> finishedStates#
Finished state for each sequence: [batch_size] (0=not finished, 1=finished)
-
std::vector<int32_t> actualIterations#
Actual iterations run for each sequence: [batch_size].
-
int32_t packedInputLength#
Packed input length for batch processing (max of all sequences, considering engine constraints)
-
std::unordered_map<int32_t, std::vector<int32_t>> evictedTokenIds#
Token IDs of evicted batches.
-
std::unordered_map<int32_t, int32_t> evictedGenerateLengths#
Generation lengths of evicted batches.
-
std::unordered_map<int32_t, int32_t> evictedActualIterations#
Iterations of evicted batches.
-
std::unordered_map<int32_t, std::string> evictedSystemPrompts#
System prompts of evicted batches.
-
std::unordered_map<int32_t, std::vector<int32_t>> evictedRawBatchedInputIds#
Raw input IDs of evicted batches.
-
std::unordered_map<int32_t, int32_t> evictedPromptLengths#
Prompt lengths of evicted batches.
-
std::vector<int32_t> batchIndexMapping#
Maps current batch index to original index.
-
int32_t generationRound#
Current generation round (shared across all batches)
-
int32_t maxGenerateLength#
Maximum generation length.
-
int32_t activeBatchSize#
Current active batch size.
-
int32_t originalBatchSize#
Original batch size (before any eviction)
-
int32_t genAndSaveSystemCacheIndex#
Batch index being processed for generating and saving system prompt KVCache.
-
cudaStream_t stream#
CUDA stream.
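The evicted* maps and batchIndexMapping together preserve per-sequence state when a sequence leaves the active batch. A hedged sketch of that bookkeeping follows; the runtime's actual eviction logic is not documented here, and this helper is illustrative only.

    // Illustrative only: record one sequence's state into the evicted* maps,
    // keyed by its original batch index via batchIndexMapping.
    void evictSequence(SpecDecodeInferenceContext &ctx, int32_t currentIndex)
    {
        int32_t const originalIndex = ctx.batchIndexMapping[currentIndex];

        ctx.evictedTokenIds[originalIndex] = ctx.tokenIds[currentIndex];
        ctx.evictedGenerateLengths[originalIndex] = ctx.currentGenerateLengths[currentIndex];
        ctx.evictedActualIterations[originalIndex] = ctx.actualIterations[currentIndex];
        ctx.evictedSystemPrompts[originalIndex] = ctx.systemPrompts[currentIndex];
        ctx.evictedRawBatchedInputIds[originalIndex] = ctx.rawBatchedInputIds[currentIndex];
        ctx.evictedPromptLengths[originalIndex] = ctx.promptLengths[currentIndex];
        // The caller would then compact the per-sequence vectors and decrement
        // activeBatchSize; that step is omitted here.
    }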
-
struct EagleDraftingConfig#
Drafting configuration for Eagle speculative decoding.
Configuration parameters to drive Eagle spec-decoding.