Decoding Strategy#
-
class DecodingStrategy#
Subclassed by trt_edgellm::rt::EagleDecoder, trt_edgellm::rt::MTPDecoder, trt_edgellm::rt::VanillaDecoder
Public Functions
-
virtual ~DecodingStrategy() noexcept = default#
-
virtual DecodingStrategyKind kind() const noexcept = 0#
-
virtual char const *name() const noexcept = 0#
-
virtual bool isSpeculative() const noexcept = 0#
-
virtual char const *unsupportedReason(LLMGenerationRequest const&)
Check whether this strategy can handle the given request.
Returns: nullptr if the request is supported; otherwise a human-readable string giving the reason it is not.
-
virtual bool decodeStep(DecodingInferenceContext &context) = 0#
-
virtual bool captureCudaGraphs(cudaStream_t stream) = 0#
-
virtual int64_t getRequiredContextMemorySize() const noexcept = 0#
-
virtual bool hasSystemPromptKVCache(SystemPromptCacheKey const&)
-
virtual void restoreSystemPromptKVCache(SystemPromptCacheKey const&, int32_t, cudaStream_t)
-
virtual bool runSystemPromptPrefill(DecodingInferenceContext&) = 0#
-
virtual void saveSystemPromptKVCache(SystemPromptCacheKey const&, std::string const&, std::vector<tokenizer::Rank> const&, int32_t, cudaStream_t)
-
virtual ~DecodingStrategy() noexcept = default#
-
struct SamplingBuffers#
-
struct BaseEngineResources#
Base-engine execution infrastructure: executor, tensor map, KV cache, pipeline I/O, shared resources, and CUDA-graph capture callback.
Public Members
-
EngineExecutor &executor#
-
HybridCacheManager &cacheManager#
-
PipelineIO &pipelineIO#
-
std::function<bool(InferenceDims const&, cudaStream_t)> captureGraph#
-
EngineExecutor &executor#
-
struct PreprocessResources#
Preprocessing resources: embedding lookup, step preparation, deepstack.
Public Members
-
StepPreparer &stepPreparer#
-
EmbeddingPreprocessor &embeddingPreprocessor#
-
EmbeddingData &embedding#
-
DeepstackBinding *deepstack#
-
StepPreparer &stepPreparer#
-
struct DecodingRuntimeContext#
Public Members
-
DeploymentConfig &deployment#
-
int32_t maxRuntimeBatchSize#
-
BaseEngineResources base#
-
PreprocessResources preprocess#
-
SamplingBuffers sampling#
-
DeploymentConfig &deployment#