Spec Decode Utils

char const *trt_edgellm::rt::spec_decode_utils::isGreedyCompatible( LLMGenerationRequest const &request ) noexcept

Check whether a generation request is compatible with greedy-only spec-decode sampling.

Returns: nullptr if the request is compatible; otherwise a human-readable reason string.

std::unique_ptr<EngineExecutor> trt_edgellm::rt::spec_decode_utils::loadDraftEngine( std::filesystem::path const &engineDir, DeploymentConfig &deployment )

Load the draft engine from disk and return an EngineExecutor.

void trt_edgellm::rt::spec_decode_utils::appendAcceptedTokens( DecodingInferenceContext &context, Tensor &hostAcceptLengths, Tensor &hostAcceptedTokenIds, Tensor const &deviceAcceptLength, Tensor const &deviceAcceptedTokenIds, int32_t maxAcceptDepth, tokenizer::Tokenizer const &tokenizer, cudaStream_t stream )

Copy accepted tokens from device buffers into the host-side context token lists.