Decoding Inference Context#
-
struct BatchResult#
Batch result data for a single sequence.
Encapsulates all data needed to track a batch’s execution results, whether the sequence is still active or has already been evicted.
Public Members
-
std::vector<int32_t> tokenIds#
Generated token IDs.
-
std::vector<int32_t> rawBatchedInputIds#
Original input token IDs.
-
int32_t generateLength = {0}#
Number of tokens generated.
-
int32_t actualIterations = {0}#
Number of iterations executed.
-
int32_t effectivePrefillLength = {0}#
Effective prefill length after system prompt cache reuse.
-
FinishReason terminalReason{FinishReason::kNotFinished}#
Why this batch terminated (EOS, length, stop string, cancel, or error).
-
std::vector<int32_t> tokenIds#
-
struct DecodingInferenceContext#
Per-request execution context shared by runtime and decoding strategies.
Holds request-local sequence metadata, sampling parameters, multimodal embedding references, streaming state, and batch-eviction bookkeeping.
Public Functions
- void initialize(
- int32_t batchSize,
- int32_t maxGenLength,
- rt::OptionalInputTensor const &visual,
- rt::OptionalInputTensors const &deepstackFeatures,
- std::string const &loraName,
- cudaStream_t cudaStream
- )
Initialize request-local vectors and scalar fields.
- Parameters:
batchSize – Active batch size
maxGenLength – Maximum generation length
visual – Optional visual embeddings
deepstackFeatures – Deepstack features for Qwen3-VL
loraName – LoRA weights name used by this request
cudaStream – CUDA stream for operations
Public Members
-
std::vector<std::string> systemPrompts#
System prompts for each sequence in batch.
-
std::vector<std::vector<int32_t>> rawBatchedInputIds#
Original token IDs before preprocessing.
-
std::vector<std::vector<int32_t>> tokenIds#
Token IDs for each sequence: [batch_size][seq_length].
-
std::vector<int32_t> currentGenerateLengths#
Current generation length for each sequence.
-
std::vector<int32_t> effectivePrefillLengths#
Prefill length after system prompt cache reuse.
-
std::vector<int8_t> finishedStates#
Finished state for each sequence.
-
std::unordered_map<int32_t, BatchResult> completedBatches#
Results of completed batches.
-
std::vector<int32_t> batchIndexMapping#
Maps current batch index to original index.
-
std::vector<SlotStreamState> slotStreams#
Per-slot streaming state.
-
int32_t generationRound = {}#
Current generation round.
-
int32_t maxGenerateLength = {}#
Maximum generation length.
-
int32_t activeBatchSize = {}#
Current active batch size.
-
std::string loraWeightsName = {""}#
LoRA adapter name used by this request.
-
cudaStream_t stream = {}#
CUDA stream.
-
float temperature = {1.0f}#
Temperature for sampling.
-
float topP = {1.0f}#
Top-P sampling parameter.
-
int64_t topK = {0}#
Top-K sampling parameter.
-
std::vector<std::vector<std::string>> stopStringsPerSlot#
Stop strings for each slot.
-
bool outputThinkerEmbeddings = {false}#
Whether to capture hidden states for the Talker pipeline.
-
std::optional<TokenCallback> onTokenGenerated#
Optional per-token callback invoked after each accepted token update.