Decoding Strategy#

class DecodingStrategy#

Subclassed by trt_edgellm::rt::EagleDecoder, trt_edgellm::rt::MTPDecoder, trt_edgellm::rt::VanillaDecoder

Public Functions

virtual ~DecodingStrategy() noexcept = default#

virtual DecodingStrategyKind kind() const noexcept = 0#

virtual char const *name() const noexcept = 0#

virtual bool isSpeculative() const noexcept = 0#

virtual char const *unsupportedReason( LLMGenerationRequest const& ) const noexcept = 0#

Check whether this strategy can handle the given request.

Returns: nullptr if the request is supported; otherwise, a human-readable string explaining why it is not.

virtual bool decodeStep(DecodingInferenceContext &context) = 0#

virtual bool captureCudaGraphs(cudaStream_t stream) = 0#

virtual int64_t getRequiredContextMemorySize() const noexcept = 0#

virtual void setContextMemory(Tensor&) = 0#

virtual bool hasSystemPromptKVCache( SystemPromptCacheKey const& ) const = 0#

virtual void restoreSystemPromptKVCache( SystemPromptCacheKey const&, int32_t, cudaStream_t ) = 0#

virtual bool runSystemPromptPrefill(DecodingInferenceContext&) = 0#

virtual void saveSystemPromptKVCache( SystemPromptCacheKey const&, std::string const&, std::vector<tokenizer::Rank> const&, int32_t, cudaStream_t ) = 0#

virtual void resetForNewSequences(Tensor&, cudaStream_t) = 0#

virtual void onBatchEvict( std::vector<int32_t> const&, int32_t, int32_t, Tensor&, cudaStream_t ) = 0#

struct SamplingBuffers#

Public Members

Tensor &workspace#

Tensor &indices#

Tensor &scores#

Tensor &baseVocabMappingTable#

Tensor &hostPackedTokenIds#

Tensor &hostSelectedTokenIds#

struct BaseEngineResources#

Base-engine execution infrastructure: executor, tensor map, KV cache, pipeline I/O, shared resources, and CUDA-graph capture callback.

Public Members

EngineExecutor &executor#

TensorMap &tensorMap#

SharedResources &sharedResources#

HybridCacheManager &cacheManager#

PipelineIO &pipelineIO#

std::function<bool(InferenceDims const&, cudaStream_t)> captureGraph#

struct PreprocessResources#

Preprocessing resources: embedding lookup, step preparation, deepstack.

Public Members

StepPreparer &stepPreparer#

EmbeddingPreprocessor &embeddingPreprocessor#

EmbeddingData &embedding#

Tensor &idsInput#

DeepstackBinding *deepstack#

struct DecodingRuntimeContext#

Public Members

DeploymentConfig &deployment#

int32_t maxRuntimeBatchSize#

BaseEngineResources base#

PreprocessResources preprocess#

tokenizer::Tokenizer &tokenizer#

SamplingBuffers sampling#