Spec Decode Utils

char const *trt_edgellm::rt::spec_decode_utils::isGreedyCompatible(
LLMGenerationRequest const &request
) noexcept

Check whether a generation request is compatible with greedy-only speculative-decoding sampling.

Returns:

nullptr if the request is compatible; otherwise, a human-readable string giving the reason it is not.

std::unique_ptr<EngineExecutor> trt_edgellm::rt::spec_decode_utils::loadDraftEngine(
std::filesystem::path const &engineDir,
DeploymentConfig &deployment
)

Load the draft engine from disk and return an EngineExecutor.

void trt_edgellm::rt::spec_decode_utils::appendAcceptedTokens(
DecodingInferenceContext &context,
Tensor &hostAcceptLengths,
Tensor &hostAcceptedTokenIds,
Tensor const &deviceAcceptLength,
Tensor const &deviceAcceptedTokenIds,
int32_t maxAcceptDepth,
tokenizer::Tokenizer const &tokenizer,
cudaStream_t stream
)

Copy accepted tokens from device buffers into the host-side context token lists.