MTP Decoder

class MTPDecoder : public trt_edgellm::rt::DecodingStrategy

Public Functions

MTPDecoder(
DecodingRuntimeContext &runtime,
std::filesystem::path const &engineDir,
SpecDecodeDraftingConfig const &draftingConfig,
cudaStream_t stream
)
inline virtual DecodingStrategyKind kind() const noexcept override
inline virtual char const *name() const noexcept override
inline virtual bool isSpeculative() const noexcept override
virtual char const *unsupportedReason(
LLMGenerationRequest const &request
) const noexcept override

Checks whether this strategy can handle the given request.

Returns:

nullptr if the request is supported; otherwise a human-readable string giving the reason it is not.

virtual bool decodeStep(DecodingInferenceContext &context) override
virtual bool captureCudaGraphs(cudaStream_t stream) override
virtual int64_t getRequiredContextMemorySize(
) const noexcept override
virtual void setContextMemory(Tensor &memory) override
virtual bool hasSystemPromptKVCache(
SystemPromptCacheKey const &key
) const override
virtual void restoreSystemPromptKVCache(
SystemPromptCacheKey const &key,
int32_t batchIdx,
cudaStream_t stream
) override
virtual bool runSystemPromptPrefill(
DecodingInferenceContext &context
) override
virtual void saveSystemPromptKVCache(
SystemPromptCacheKey const &key,
std::string const &prompt,
std::vector<tokenizer::Rank> const &tokenizedPrompt,
int32_t promptIdsLength,
cudaStream_t stream
) override
virtual void resetForNewSequences(
Tensor &reuseLengths,
cudaStream_t stream
) override
virtual void onBatchEvict(
std::vector<int32_t> const &batchMapping,
int32_t oldActiveBatch,
int32_t newActiveBatch,
Tensor &deviceBatchMapping,
cudaStream_t stream
) override