LLM Inference Spec Decode Runtime#

class LLMInferenceSpecDecodeRuntime#

LLM inference runtime with Eagle speculative decoding.

Manages inference pipeline using Eagle speculative decoding for improved throughput. Coordinates base model, draft model, and multimodal processing.

Public Functions

LLMInferenceSpecDecodeRuntime(
std::string const &engineDir,
std::string const &multimodalEngineDir,
EagleDraftingConfig const &draftingConfig,
cudaStream_t stream
)#

Construct speculative decode runtime.

Parameters:
  • engineDir – Directory containing engine files

  • multimodalEngineDir – Directory containing multimodal engine files

  • draftingConfig – Eagle drafting configuration

  • stream – CUDA stream for operations

~LLMInferenceSpecDecodeRuntime() = default#

Destructor.

bool captureDraftProposalCudaGraph(cudaStream_t stream)#

Capture CUDA graph for draft proposal.

Parameters:

stream – CUDA stream

Returns:

True on success, false on failure

bool captureDraftAcceptDecodeTokenCudaGraph(cudaStream_t stream)#

Capture CUDA graph for draft accept decode token.

Parameters:

stream – CUDA stream

Returns:

True on success, false on failure

bool captureBaseVerificationCudaGraph(cudaStream_t stream)#

Capture CUDA graph for base verification.

Parameters:

stream – CUDA stream

Returns:

True on success, false on failure

bool handleRequest(
LLMGenerationRequest const &request,
LLMGenerationResponse &response,
cudaStream_t stream
)#

Handle generation request.

Parameters:
  • request – Generation request with prompts and parameters

  • response – Output response with generated tokens and text

  • stream – CUDA stream

Returns:

True on success, false on failure

inline metrics::LLMPrefillMetrics const &getPrefillMetrics() const#

Get LLM prefill stage metrics.

inline metrics::EagleGenerationMetrics const &getEagleGenerationMetrics(
) const#

Get Eagle generation stage metrics.

inline metrics::MultimodalMetrics getMultimodalMetrics() const#

Get multimodal metrics (returns empty metrics if no multimodal runner is present).

struct SpecDecodeInferenceContext#

Execution context for speculative decode runtime.

Holds execution information and intermediate metadata during inference. Supports multi-batch inference with independent sequence tracking.

Public Functions

void initialize(
int32_t batchSize,
int32_t maxGenLength,
rt::OptionalInputTensor const &multimodal,
rt::OptionalInputTensors const &extraInputTensors,
cudaStream_t cudaStream
)#

Initialize the context with given parameters.

Parameters:
  • batchSize – Active batch size

  • maxGenLength – Maximum generation length

  • multimodal – Optional multimodal embeddings

  • extraInputTensors – Extra input tensors (e.g., deepstack features)

  • cudaStream – CUDA stream for operations

Public Members

std::vector<std::string> systemPrompts#

System prompts for each sequence in batch.

std::vector<std::vector<int32_t>> rawBatchedInputIds#

Original token IDs before preprocessing (preprocessing includes padding and removal of reused system IDs).

std::vector<std::vector<int32_t>> tokenIds#

Token IDs for each sequence: [batch_size][seq_length].

std::vector<int32_t> currentGenerateLengths#

Current generation length for each sequence: [batch_size].

std::vector<int32_t> promptLengths#

Prompt length (after reuse) for each sequence: [batch_size].

std::vector<int8_t> finishedStates#

Finished state for each sequence: [batch_size] (0 = not finished, 1 = finished).

std::vector<int32_t> actualIterations#

Actual iterations run for each sequence: [batch_size].

int32_t packedInputLength#

Packed input length for batch processing (maximum over all sequences, subject to engine constraints).

std::unordered_map<int32_t, std::vector<int32_t>> evictedTokenIds#

Token IDs of evicted batches.

std::unordered_map<int32_t, int32_t> evictedGenerateLengths#

Generation lengths of evicted batches.

std::unordered_map<int32_t, int32_t> evictedActualIterations#

Iterations of evicted batches.

std::unordered_map<int32_t, std::string> evictedSystemPrompts#

System prompts of evicted batches.

std::unordered_map<int32_t, std::vector<int32_t>> evictedRawBatchedInputIds#

Raw input IDs of evicted batches.

std::unordered_map<int32_t, int32_t> evictedPromptLengths#

Prompt lengths of evicted batches.

std::vector<int32_t> batchIndexMapping#

Maps current batch index to original index.

rt::OptionalInputTensor multimodalEmbeddings#

Optional multimodal embeddings.

rt::OptionalInputTensors extraInputTensors#

Extra input tensors (e.g., deepstack features)

int32_t generationRound#

Current generation round (shared across all batches)

int32_t maxGenerateLength#

Maximum generation length.

int32_t activeBatchSize#

Current active batch size.

int32_t originalBatchSize#

Original batch size (before any eviction)

int32_t genAndSaveSystemCacheIndex#

Batch index currently being processed for generating and saving the system-prompt KV cache.

cudaStream_t stream#

CUDA stream.

struct EagleDraftingConfig#

Drafting configuration for Eagle speculative decoding.

Configuration parameters to drive Eagle spec-decoding.

Public Members

int32_t draftingTopK#

Number of tokens to select from one predecessor for the next draft tree level.

int32_t draftingStep#

Number of drafting steps with draft model.

int32_t verifyTreeSize#

Number of tokens for base model verification.