LLM Runtime Utils#
-
struct Message#
Message with role and contents.
Public Members
-
std::vector<MessageContent> contents#
Contents of the message.
-
struct MessageContent#
-
std::vector<MessageContent> contents#
-
struct MessageContent
Public Members
-
std::string type
Content type (text, image, trajectory)
-
std::string content
Text content when content type is text. Image data will be stored in corresponding imageBuffers. For type “trajectory”, data is stored in Request::pastTrajectory.
-
std::string type
-
struct TokenCallbackInfo#
Per-token callback info delivered during Thinker decode.
-
struct LLMGenerationRequest#
LLM Generation Request structure.
Public Members
-
std::vector<Request> requests#
Vector of requests for a batch.
-
mutable std::vector<FormattedRequest> formattedRequests#
Formatted requests (mutable to allow runtime modification)
-
float temperature#
Temperature parameter for sampling.
-
float topP#
Top-p (nucleus) sampling parameter.
-
int64_t topK#
Top-k sampling parameter.
-
int64_t maxGenerateLength#
Max length of the generated tokens.
-
std::string loraWeightsName = {""}#
Name of the LoRA weights. Defaults to an empty string, meaning no LoRA weights are used.
-
bool saveSystemPromptKVCache = {false}#
-
bool applyChatTemplate = {true}#
-
bool addGenerationPrompt = {true}#
-
bool enableThinking = {false}#
-
bool disableSpecDecode = {false}#
-
std::vector<std::shared_ptr<StreamChannel>> streamChannels#
Per-slot streaming channels. Size 0 disables streaming globally. When non-empty, the size must equal
requests.size(), and individual entries may be null to opt out on a per-slot basis. Channels must not already be finished or concurrently attached to another in-flight request.
-
bool generateAudio = {false}#
Whether to enable hidden states capture for Talker pipeline.
-
int32_t acceptHiddenLayer = {0}#
Hidden layer index for Talker (from talker_config.accept_hidden_layer)
-
std::optional<TokenCallback> onTokenGenerated#
Optional per-token callback invoked after each decode step. Called after cudaStreamSynchronize inside the decode loop. When nullopt (default), zero overhead — no callback is invoked.
-
std::vector<Request> requests#
-
struct LLMGenerationResponse#
LLM Generation Response structure.
Public Members
-
std::vector<std::vector<int32_t>> outputIds#
Generated token IDs for each request in the batch.
-
std::vector<std::string> outputTexts#
Generated text strings for each request in the batch.
-
std::vector<std::vector<FutureTrajectoryPoint>> outputTrajectories#
Future trajectory waypoints (e.g. accel, kappa) per batch item; populated when action engine is used.
-
std::vector<rt::audioUtils::AudioData> outputAudios#
Generated audio data (Qwen3-Omni only)
-
std::vector<FinishReason> finishReasons#
Why each request halted (EOS, length, stop string, cancel, error); see
runtime/streaming.h.
-
std::vector<std::vector<int32_t>> outputIds#
-
struct LongRopeParams#
Long-Rope specific parameters.
-
struct RopeConfig#
RoPE configuration structure with optional Long-Rope parameters.
Contains common RoPE fields and (optionally) Long-Rope specific parameters when type==kLongRope.
Public Members
-
RopeType type = {RopeType::kDefault}#
Type of RoPE to use.
-
float rotaryScale = {1.0F}#
Scaling factor for rotary embeddings.
-
float rotaryTheta = {100000.0F}#
Base frequency for rotary embeddings.
-
float partialRotaryFactor = {1.0F}#
Fraction of head angles rotated by proportional RoPE.
-
int32_t maxPositionEmbeddings = {32768}#
Maximum position embeddings supported.
-
std::optional<LongRopeParams> longRope = {}#
Long-Rope specific parameters.
-
RopeType type = {RopeType::kDefault}#
-
struct EmbeddingData#
Embedding data - supports both FP16 and FP8 formats.
The embedding table datatype determines the format:
FP16: table is FP16, tableScalingFactor is empty
FP8: table is FP8 (E4M3), tableScalingFactor contains FP32 per-group scales
The kernel functions automatically dispatch based on table.getDataType().
Public Functions