MTP Decoder

class MTPDecoder : public trt_edgellm::rt::DecodingStrategy

Public Functions

MTPDecoder(DecodingRuntimeContext &runtime, std::filesystem::path const &engineDir, SpecDecodeDraftingConfig const &draftingConfig, cudaStream_t stream)

inline virtual DecodingStrategyKind kind() const noexcept override

inline virtual char const *name() const noexcept override

inline virtual bool isSpeculative() const noexcept override

virtual char const *unsupportedReason(LLMGenerationRequest const &request) const noexcept override

Check whether this strategy can handle the given request.

Returns: nullptr if the request is supported; otherwise a human-readable string giving the reason it is not.

virtual bool decodeStep(DecodingInferenceContext &context) override

virtual bool captureCudaGraphs(cudaStream_t stream) override

virtual int64_t getRequiredContextMemorySize() const noexcept override

virtual void setContextMemory(Tensor &memory) override

virtual bool hasSystemPromptKVCache(SystemPromptCacheKey const &key) const override

virtual void restoreSystemPromptKVCache(SystemPromptCacheKey const &key, int32_t batchIdx, cudaStream_t stream) override

virtual bool runSystemPromptPrefill(DecodingInferenceContext &context) override

virtual void saveSystemPromptKVCache(SystemPromptCacheKey const &key, std::string const &prompt, std::vector<tokenizer::Rank> const &tokenizedPrompt, int32_t promptIdsLength, cudaStream_t stream) override

virtual void resetForNewSequences(Tensor &reuseLengths, cudaStream_t stream) override

virtual void onBatchEvict(std::vector<int32_t> const &batchMapping, int32_t oldActiveBatch, int32_t newActiveBatch, Tensor &deviceBatchMapping, cudaStream_t stream) override