Decoding Strategy

class DecodingStrategy

Subclassed by trt_edgellm::rt::EagleDecoder, trt_edgellm::rt::MTPDecoder, trt_edgellm::rt::VanillaDecoder

Public Functions

virtual ~DecodingStrategy() noexcept = default
virtual DecodingStrategyKind kind() const noexcept = 0
virtual char const *name() const noexcept = 0
virtual bool isSpeculative() const noexcept = 0
virtual char const *unsupportedReason(LLMGenerationRequest const&) const noexcept = 0

Checks whether this strategy can handle the given request.

Returns: nullptr if the request is supported; otherwise, a human-readable string giving the reason it is not.

virtual bool decodeStep(DecodingInferenceContext &context) = 0
virtual bool captureCudaGraphs(cudaStream_t stream) = 0
virtual int64_t getRequiredContextMemorySize() const noexcept = 0
virtual void setContextMemory(Tensor&) = 0
virtual bool hasSystemPromptKVCache(SystemPromptCacheKey const&) const = 0
virtual void restoreSystemPromptKVCache(SystemPromptCacheKey const&, int32_t, cudaStream_t) = 0
virtual bool runSystemPromptPrefill(DecodingInferenceContext&) = 0
virtual void saveSystemPromptKVCache(SystemPromptCacheKey const&, std::string const&, std::vector<tokenizer::Rank> const&, int32_t, cudaStream_t) = 0
virtual void resetForNewSequences(Tensor&, cudaStream_t) = 0
virtual void onBatchEvict(std::vector<int32_t> const&, int32_t, int32_t, Tensor&, cudaStream_t) = 0
struct SamplingBuffers

Public Members

Tensor &workspace
Tensor &indices
Tensor &scores
Tensor &baseVocabMappingTable
Tensor &hostPackedTokenIds
Tensor &hostSelectedTokenIds
struct BaseEngineResources

Base-engine execution infrastructure: executor, tensor map, KV cache, pipeline I/O, shared resources, and CUDA-graph capture callback.

Public Members

EngineExecutor &executor
TensorMap &tensorMap
SharedResources &sharedResources
HybridCacheManager &cacheManager
PipelineIO &pipelineIO
std::function<bool(InferenceDims const&, cudaStream_t)> captureGraph
struct PreprocessResources

Preprocessing resources: embedding lookup, step preparation, deepstack.

Public Members

StepPreparer &stepPreparer
EmbeddingPreprocessor &embeddingPreprocessor
EmbeddingData &embedding
Tensor &idsInput
DeepstackBinding *deepstack
struct DecodingRuntimeContext

Public Members

DeploymentConfig &deployment
int32_t maxRuntimeBatchSize
BaseEngineResources base
PreprocessResources preprocess
tokenizer::Tokenizer &tokenizer
SamplingBuffers sampling