Executor

types.h

template<>
struct TypeTraits<std::int8_t>

Public Static Attributes

static constexpr auto value = DataType::kINT8
template<>
struct TypeTraits<std::int32_t>

Public Static Attributes

static constexpr auto value = DataType::kINT32
template<>
struct TypeTraits<std::int64_t>

Public Static Attributes

static constexpr auto value = DataType::kINT64
template<>
struct TypeTraits<std::uint8_t>

Public Static Attributes

static constexpr auto value = DataType::kUINT8
namespace tensorrt_llm
namespace executor

Typedefs

using TensorPtr = std::shared_ptr<Tensor>
using SizeType32 = std::int32_t
using FloatType = float
using TokenIdType = std::int32_t
using VecTokens = std::vector<TokenIdType>
using BeamTokens = std::vector<VecTokens>
using IdType = std::uint64_t
using VecTokenExtraIds = std::vector<IdType>
using IterationType = std::uint64_t
using RandomSeedType = std::uint64_t
using VecLogProbs = std::vector<FloatType>
using StreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>
using MillisecondsType = std::chrono::milliseconds
using LogitsPostProcessor = std::function<void(IdType, Tensor&, BeamTokens const&, StreamPtr const&, std::optional<IdType>)>
using LogitsPostProcessorMap = std::unordered_map<std::string, LogitsPostProcessor>
using LogitsPostProcessorBatched = std::function<void(std::vector<IdType> const&, std::vector<Tensor>&, std::vector<std::reference_wrapper<BeamTokens const>> const&, StreamPtr const&, std::vector<std::optional<IdType>> const&)>
using MedusaChoices = std::vector<std::vector<SizeType32>>
using EagleChoices = std::vector<std::vector<SizeType32>>
using PriorityType = float
using BufferView = std::basic_string_view<uint8_t>

Enums

enum class DataType

Values:

enumerator kBOOL
enumerator kUINT8
enumerator kINT8
enumerator kINT32
enumerator kINT64
enumerator kBF16
enumerator kFP8
enumerator kFP16
enumerator kFP32
enumerator kUNKNOWN
enum class RequestType

Values:

enumerator REQUEST_TYPE_CONTEXT_AND_GENERATION
enumerator REQUEST_TYPE_CONTEXT_ONLY
enumerator REQUEST_TYPE_GENERATION_ONLY
enum class MemoryType

Values:

enumerator kCPU
enumerator kCPU_PINNED
enumerator kCPU_PINNEDPOOL
enumerator kGPU
enumerator kUVM
enumerator kUNKNOWN
enum class ModelType

Values:

enumerator kDECODER_ONLY
enumerator kENCODER_ONLY
enumerator kENCODER_DECODER
enum class BatchingType

The batching type.

Values:

enumerator kSTATIC

STATIC refers to the traditional batching scheme with a batch of requests running in lockstep until the full generation for all of them is complete. Requests in a batch are all padded up to the maximum input and output sequence length of any member of the batch.

enumerator kINFLIGHT

INFLIGHT refers to a scheme where newly arrived requests are dynamically incorporated into the batch under execution, and requests are returned as soon as the end condition is met without any padding.

enum class CapacitySchedulerPolicy

The policy used to select the subset of available requests in each iteration of the executor generation loop.

Values:

enumerator kMAX_UTILIZATION

MAX_UTILIZATION packs as many requests as the underlying TRT engine can support in any iteration of the InflightBatching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability.

enumerator kGUARANTEED_NO_EVICT

GUARANTEED_NO_EVICT uses KV cache more conservatively guaranteeing that a request, once started, will run to completion without eviction.

enumerator kSTATIC_BATCH

kSTATIC_BATCH does not schedule new requests until all requests in current batch are completed. Similar to kGUARANTEED_NO_EVICT, requests will run to completion without eviction.

enum class ContextChunkingPolicy

Values:

enumerator kFIRST_COME_FIRST_SERVED

Sequential chunking, complete the unfinished context phase first.

enumerator kEQUAL_PROGRESS

Iterate through each context request in sequence and attempt to increase its chunk count until the constraint is exceeded.

enum class CommunicationType

Values:

enumerator kMPI
enum class CommunicationMode

Values:

enumerator kLEADER
enumerator kORCHESTRATOR
enum class RequestStage

Enum class that represents the state of a request.

Values:

enumerator kQUEUED

Requests that have been received but not yet included in the active requests (for example, due to constraints such as the maximum batch size).

enumerator kENCODER_IN_PROGRESS

Active request in encoder phase.

enumerator kCONTEXT_IN_PROGRESS

Active request in context phase.

enumerator kGENERATION_IN_PROGRESS

Active request in generation phase.

enumerator kGENERATION_COMPLETE

Active request for which generation has completed.

enum class FinishReason

The reason why the model stopped generating tokens for a request.

Values:

enumerator kNOT_FINISHED

The request is not finished.

enumerator kEND_ID

The request finished because the end id was generated.

enumerator kSTOP_WORDS

The request finished because a stop word was generated.

enumerator kLENGTH

The request finished because the maximum number of tokens was reached.

enumerator kTIMED_OUT

The request finished because it got timed out (via the mAllotedTime parameter)

enumerator kCANCELLED

The request was cancelled by calling cancelRequest.

Functions

std::ostream &operator<<(std::ostream &os, CapacitySchedulerPolicy policy)
std::ostream &operator<<(std::ostream &os, ContextChunkingPolicy policy)
template<typename T, bool = false>
struct TypeTraits
#include <types.h>

For converting a C++ data type to the corresponding TRT-LLM DataType.

template<>
struct TypeTraits<float>

Public Static Attributes

static constexpr auto value = DataType::kFP32
template<>
struct TypeTraits<half>

Public Static Attributes

static constexpr auto value = DataType::kFP16
template<>
struct TypeTraits<std::int8_t>

Public Static Attributes

static constexpr auto value = DataType::kINT8
template<>
struct TypeTraits<std::int32_t>

Public Static Attributes

static constexpr auto value = DataType::kINT32
template<>
struct TypeTraits<std::int64_t>

Public Static Attributes

static constexpr auto value = DataType::kINT64
template<>
struct TypeTraits<bool>

Public Static Attributes

static constexpr auto value = DataType::kBOOL
template<>
struct TypeTraits<std::uint8_t>

Public Static Attributes

static constexpr auto value = DataType::kUINT8
template<typename T>
struct TypeTraits<T*>

Public Static Attributes

static constexpr auto value = DataType::kINT64
struct KvCacheStats
#include <types.h>

Struct that holds the stats of a KV cache manager.

Public Members

SizeType32 maxNumBlocks

Max number of blocks.

SizeType32 freeNumBlocks

Number of free blocks.

SizeType32 usedNumBlocks

Number of used blocks.

SizeType32 tokensPerBlock

Number of tokens per block.

SizeType32 allocTotalBlocks

Number of total allocated blocks.

SizeType32 allocNewBlocks

Number of newly allocated blocks.

SizeType32 reusedBlocks

Number of reused blocks.

SizeType32 missedBlocks

Number of blocks that were not reused (missed).

float cacheHitRate

Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).

struct StaticBatchingStats
#include <types.h>

Struct that holds the stats of static batching models for a single iteration.

Public Members

SizeType32 numScheduledRequests

Number of scheduled requests.

SizeType32 numContextRequests

Number of requests in context stage.

SizeType32 numCtxTokens

Total number of context tokens in the iteration.

SizeType32 numGenTokens

Total number of tokens to generate in the iteration.

SizeType32 emptyGenSlots

Total number of unused generation token slots.

struct InflightBatchingStats
#include <types.h>

Struct that holds the stats of inflight batching models for a single iteration.

Public Members

SizeType32 numScheduledRequests

Number of scheduled requests.

SizeType32 numContextRequests

Number of requests in context stage.

SizeType32 numGenRequests

Number of requests in generation stage.

SizeType32 numPausedRequests

Number of paused requests.

SizeType32 numCtxTokens

Total number of context tokens in the iteration.

SizeType32 microBatchId

Index of the micro batch.

float avgNumDecodedTokensPerIter

Average number of tokens decoded per request per iteration.

struct IterationStats
#include <types.h>

Struct that holds the stats of a single iteration.

Public Members

std::string timestamp

Ending time of this iteration.

IterationType iter

Iteration id.

double iterLatencyMS

Iteration latency (ms)

double newActiveRequestsQueueLatencyMS

The total time spent in queue by the requests that became active in this iteration (ms)

SizeType32 numNewActiveRequests

Number of new fetched active requests.

SizeType32 numActiveRequests

Number of active requests.

SizeType32 numQueuedRequests

Number of queued requests.

SizeType32 numCompletedRequests

Number of requests that were completed in this iteration.

SizeType32 maxNumActiveRequests

Number of max active requests.

SizeType32 maxBatchSizeStatic

Static max batch size passed to the executor.

SizeType32 maxBatchSizeTunerRecommended

Batch size produced by dynamic tuner based on input stats.

SizeType32 maxBatchSizeRuntime

The min of maxBatchSizeStatic and maxBatchSizeRuntimeUpperbound.

SizeType32 maxNumTokensStatic

Static max num tokens passed to the executor.

SizeType32 maxNumTokensTunerRecommended

Max num tokens produced by dynamic tuner based on input stats.

SizeType32 maxNumTokensRuntime

The runtime max num tokens.

size_t gpuMemUsage

GPU memory usage in bytes.

size_t cpuMemUsage

CPU memory usage in bytes.

size_t pinnedMemUsage

Pinned memory usage in bytes.

std::optional<KvCacheStats> kvCacheStats

Stats specific to KV caches.

std::optional<KvCacheStats> crossKvCacheStats

Stats specific to cross KV caches.

std::optional<StaticBatchingStats> staticBatchingStats

Stats specific to static batching.

std::optional<InflightBatchingStats> inflightBatchingStats

Stats specific to inflight batching.

struct DisServingRequestStats
#include <types.h>

Struct that holds the request stats in the case of disaggregated serving.

Public Members

double kvCacheTransferMS

The total time spent on transferring KV cache from context phase to generation phase (ms)

struct RequestStats
#include <types.h>

Struct that holds the stats of a single request.

Public Members

IdType id

The request id.

RequestStage stage

The current stage the request is in.

SizeType32 contextPrefillPosition

If using chunked context, the current context prefill position.

SizeType32 numGeneratedTokens

The number of generated tokens so far.

float avgNumDecodedTokensPerIter

The average number of decoded tokens per iteration. It is >= 1 for speculative decoding.

bool scheduled

Whether the request is scheduled for the current iteration.

bool paused

Whether the request is being paused at the current iteration due to lack of resources (KV cache blocks exhaustion for example)

std::optional<DisServingRequestStats> disServingStats

Stats specific to disaggregated serving.

SizeType32 allocTotalBlocksPerRequest

Number of total allocated blocks per request.

SizeType32 allocNewBlocksPerRequest

Number of newly allocated blocks per request.

SizeType32 reusedBlocksPerRequest

Number of reused blocks per request.

SizeType32 missedBlocksPerRequest

Number of missed blocks per request.

SizeType32 kvCacheHitRatePerRequest

KV Cache Hit Rate per request, defined as reusedBlocks / (reusedBlocks + missedBlocks)

struct RequestStatsPerIteration
#include <types.h>

Struct that holds the stats of all requests in an iteration.

Public Members

IterationType iter

The iteration id for these stats.

std::vector<RequestStats> requestStats

The stats of all active requests for this iteration.

struct RequestPerfMetrics
#include <types.h>

Struct that holds the stats of a request.

Public Types

using TimePoint = std::chrono::time_point<std::chrono::steady_clock>

Public Members

TimingMetrics timingMetrics
KvCacheMetrics kvCacheMetrics
std::optional<IterationType> firstIter

First iteration where the request was processed.

std::optional<IterationType> lastIter

Last iteration where a token was generated.

std::optional<IterationType> iter

Current iteration.

struct KvCacheMetrics

Public Members

SizeType32 numTotalAllocatedBlocks = {0}

Number of total allocated blocks.

SizeType32 numNewAllocatedBlocks = {0}

Number of newly allocated blocks.

SizeType32 numReusedBlocks = {0}

Number of reused blocks.

SizeType32 numMissedBlocks = {0}

Number of missed blocks.

SizeType32 kvCacheHitRate = {0}

KV Cache Hit Rate, defined as reusedBlocks / (reusedBlocks + missedBlocks)

struct TimingMetrics

Public Members

TimePoint arrivalTime

The time when the request arrived.

TimePoint firstScheduledTime

The time when the request was first scheduled.

TimePoint firstTokenTime

The time when the first token was generated.

TimePoint lastTokenTime

The time when the request was finished.

TimePoint kvCacheTransferStart

Start time of the KV cache transfer for disaggregated serving.

TimePoint kvCacheTransferEnd

End time of the KV cache transfer for disaggregated serving.

struct DebugTensorsPerIteration
#include <types.h>

Struct that holds the debug tensors in an iteration.

Public Members

IterationType iter

The iteration id for these tensors.

std::map<std::string, Tensor> debugTensors

The debug tensors for this iteration.

class DecodingMode
#include <types.h>

Mode of the decoder.

Public Types

using UnderlyingType = uint32_t

Public Functions

inline constexpr auto useTemperature(bool useTemp)
inline constexpr auto useOccurrencePenalties(bool usePenalty)
inline constexpr auto usePresencePenalty(bool usePenalty)
inline constexpr auto useRepetitionPenalty(bool usePenalty)
inline constexpr auto useFrequencyPenalty(bool usePenalty)
inline constexpr auto useMinLength(bool useMinLen)
inline constexpr auto useBanTokens(bool banTokens)
inline constexpr auto useBanWords(bool banWords)
inline constexpr auto useNoRepeatNgramSize(bool noRepeatNgramSize)
inline constexpr auto useStopWords(bool stopWords)
inline constexpr auto useMaxLengthStop(bool maxLengthStop)
inline constexpr auto useExplicitEosStop(bool explicitEosStop)
inline constexpr bool isAuto() const
inline constexpr bool isTopK() const
inline constexpr bool isTopP() const
inline constexpr bool isTopKorTopP() const
inline constexpr bool isTopKandTopP() const
inline constexpr bool isBeamSearch() const
inline constexpr bool isMedusa() const
inline constexpr bool isLookahead() const
inline constexpr bool isExplicitDraftTokens() const
inline constexpr bool isExternalDraftTokens() const
inline constexpr bool isEagle() const
inline constexpr bool isUseTemperature() const
inline constexpr bool isUsePresencePenalty() const
inline constexpr bool isUseFrequencyPenalty() const
inline constexpr bool isUseRepetitionPenalty() const
inline constexpr bool isUseMinLength() const
inline constexpr bool isUseOccurrencePenalty() const
inline constexpr bool isUsePenalty() const
inline constexpr bool isUseBanWords() const
inline constexpr bool isUseNoRepeatNgramSize() const
inline constexpr bool isUseBanTokens() const
inline constexpr bool isUseStopWords() const
inline constexpr bool isUseMaxLengthStop() const
inline constexpr bool isUseExplicitEosStop() const
inline constexpr bool isUseStopCriteria() const
inline bool operator==(DecodingMode const &other) const
inline explicit constexpr DecodingMode(UnderlyingType state)
inline constexpr UnderlyingType getState() const

Public Static Functions

static inline constexpr auto Auto()

No mode specified. Config will be determined from the beam width of the first request at runtime: TopKTopP if beamWidth == 1, BeamSearch otherwise.

static inline constexpr auto TopK()
static inline constexpr auto TopP()
static inline constexpr auto TopKTopP()
static inline constexpr auto BeamSearch()
static inline constexpr auto Medusa()
static inline constexpr auto Lookahead()
static inline constexpr auto ExplicitDraftTokens()
static inline constexpr auto ExternalDraftTokens()
static inline constexpr auto Eagle()

Private Functions

inline constexpr bool anyBitSet(UnderlyingType bits) const
inline constexpr bool allBitSet(UnderlyingType bits) const
inline constexpr UnderlyingType setBitTo(UnderlyingType state, bool x)

Private Members

UnderlyingType mState = {}

Private Static Attributes

static constexpr UnderlyingType kUseRepetitionPenalties = {1u << 0}
static constexpr UnderlyingType kUseFrequencyPenalties = {1u << 1}
static constexpr UnderlyingType kUsePresencePenalties = {1u << 2}
static constexpr UnderlyingType kUseTemperature = {1u << 3}
static constexpr UnderlyingType kUseMinLength = {1u << 4}
static constexpr UnderlyingType kUseBanWords = {1u << 5}
static constexpr UnderlyingType kUseStopWords = {1u << 6}
static constexpr UnderlyingType kUseMaxLengthStop = {1u << 7}
static constexpr UnderlyingType kUseExplicitEosStop = {1u << 8}
static constexpr UnderlyingType kUseNoRepeatNgramSize = {1u << 9}
static constexpr UnderlyingType kStandardStopCriteria = {kUseStopWords | kUseMaxLengthStop}
static constexpr UnderlyingType kUseOccurrencePenalties{kUseRepetitionPenalties | kUseFrequencyPenalties | kUsePresencePenalties}
static constexpr UnderlyingType kUsePenalties = {kUseOccurrencePenalties | kUseTemperature | kUseMinLength}
static constexpr UnderlyingType kUseBanTokens = {kUseNoRepeatNgramSize | kUseBanWords}
static constexpr SizeType32 kNumFlags = {10}
static constexpr UnderlyingType kAuto = {1u << (kNumFlags + 0)}
static constexpr UnderlyingType kTopK = {1u << (kNumFlags + 1)}
static constexpr UnderlyingType kTopP = {1u << (kNumFlags + 2)}
static constexpr UnderlyingType kBeamSearch = {1u << (kNumFlags + 3)}
static constexpr UnderlyingType kMedusa = {1u << (kNumFlags + 4)}
static constexpr UnderlyingType kLookahead = {1u << (kNumFlags + 5)}
static constexpr UnderlyingType kExplicitDraftTokens = {1u << (kNumFlags + 6)}
static constexpr UnderlyingType kExternalDraftTokens = {1u << (kNumFlags + 7)}
static constexpr UnderlyingType kEagle = {1u << (kNumFlags + 8)}
static constexpr UnderlyingType kTopKTopP = {kTopK | kTopP}
namespace runtime

disaggServerUtil.h

namespace tensorrt_llm
namespace executor
namespace disagg_executor
struct ResponseWithId

Public Functions

inline ResponseWithId(tensorrt_llm::executor::Response &&response, IdType gid)
inline ResponseWithId(tensorrt_llm::executor::Response const &response, IdType gid)
inline ResponseWithId(ResponseWithId &&other) noexcept
ResponseWithId(ResponseWithId const &other) = default
inline ResponseWithId &operator=(ResponseWithId &&other) noexcept
inline ResponseWithId &operator=(ResponseWithId const &other)
~ResponseWithId() = default

Public Members

tensorrt_llm::executor::Response response
IdType gid
class DisaggExecutorOrchestrator

Public Functions

DisaggExecutorOrchestrator(std::vector<std::filesystem::path> const &ctxEnginePaths, std::vector<std::filesystem::path> const &genEnginePaths, std::vector<executor::ExecutorConfig> const &ctxExecutorConfigs, std::vector<executor::ExecutorConfig> const &genExecutorConfigs, bool hasContextAwaitThreads, bool hasGenAwaitThreads)

Constructs a DisaggExecutorOrchestrator object.

Parameters:
  • ctxEnginePaths – A vector of file paths to context engine files.

  • genEnginePaths – A vector of file paths to generation engine files.

  • ctxExecutorConfigs – A vector of ExecutorConfig for context executors.

  • genExecutorConfigs – A vector of ExecutorConfig for generation executors.

  • hasContextAwaitThreads – Whether or not there are threads that receive responses for each context executor.

  • hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.

std::vector<IdType> enqueueContext(std::vector<texec::Request> const &requests, std::optional<int> selectContextId = std::nullopt, bool batch = false)

Enqueue context-only requests to context executors.

Parameters:
  • requests – A vector of context-only requests.

  • selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.

  • batch – If true, enqueue requests in the same context executor. If false, will try to use a different executor for each request.

Returns:

A vector of global request ids, corresponding to the order of the requests in requests, the id returned may be different from the request id in each executor.

void enqueueGeneration(std::vector<texec::Request> const &requests, std::vector<IdType> const &globalRequestIds, std::optional<int> selectGenIdx = std::nullopt, bool batch = false)

Enqueue generation-only requests to generation executors.

Parameters:
  • requests – A vector of generation-only requests.

  • globalRequestIds – A vector of global request ids, corresponding to the order of the requests. These must be the ids returned by the enqueueContext function.

  • selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.

  • batch – If true, enqueue requests in the same generation executor. If false, will try to use a different executor for each request.

std::vector<ResponseWithId> awaitContextResponses(std::optional<std::chrono::milliseconds> const &timeout, std::optional<int> contextIdx = std::nullopt)

Await for context responses.

Parameters:
  • timeout – The maximum time to wait for new responses

  • contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors. If hasContextAwaitThreads is true, then this parameter must be std::nullopt.

Returns:

A vector of responses with corresponding global request ids

std::vector<ResponseWithId> awaitGenerationResponses(std::optional<std::chrono::milliseconds> const &timeout, std::optional<int> genIdx = std::nullopt)

Await for generation responses.

Parameters:
  • timeout – The maximum time to wait for new responses.

  • genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors. If hasGenAwaitThreads is true, then this parameter must be std::nullopt.

Returns:

A vector of responses with corresponding global request ids.

bool canEnqueue() const

Indicates if the current process is allowed to enqueueRequests.

std::vector<std::unique_ptr<texec::Executor>> const &getContextExecutors() const

Get context executors.

std::vector<std::unique_ptr<texec::Executor>> const &getGenExecutors() const

Get generation executors.

~DisaggExecutorOrchestrator()

Private Members

std::unique_ptr<Impl> mImpl

tensor.h

namespace tensorrt_llm
namespace executor
class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>

Public Types

using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>
using DimType64 = typename std::remove_cv_t<Base::value_type>

Public Functions

inline Shape()
inline Shape(DimType64 const *data, Base::size_type size)
inline Shape(std::initializer_list<DimType64> dims)
class Tensor

Public Types

using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>

Public Functions

Tensor copyToCpu(Tensor::CudaStreamPtr stream = nullptr) const
Tensor copyToPinned(Tensor::CudaStreamPtr stream = nullptr) const
Tensor copyToPooledPinned(Tensor::CudaStreamPtr stream = nullptr) const
Tensor copyToManaged(Tensor::CudaStreamPtr stream = nullptr) const
Tensor copyToGpu(Tensor::CudaStreamPtr stream) const
Tensor() noexcept = default
~Tensor() = default
Tensor(Tensor const &other) noexcept = default
Tensor(Tensor &&other) noexcept = default
Tensor &operator=(Tensor const &other) noexcept = default
Tensor &operator=(Tensor &&other) noexcept = default
void *getData()

Returns a pointer to underlying array.

void const *getData() const

Returns a pointer to underlying array.

DataType getDataType() const

Returns the data type of the buffer.

MemoryType getMemoryType() const

Returns the memory type of the buffer.

Shape getShape() const

Returns the tensor dimensions.

std::size_t getSize() const

Returns the number of elements in the tensor.

std::size_t getSizeInBytes() const

Returns the size of the tensor in bytes.

void setZero(CudaStreamPtr stream = nullptr)

Set the entire memory to zero.

Parameters:

stream – Must be a valid CUDA stream if the memory type is GPU.

void setFrom(Tensor const &other, CudaStreamPtr stream = nullptr)

Copy the data and shape from another tensor.

Parameters:
  • other – A tensor to copy from.

  • stream – Must be a valid CUDA stream if the memory type is GPU.

inline explicit operator bool() const
inline bool operator==(Tensor const &rhs) const
inline bool operator!=(Tensor const &rhs) const

Public Static Functions

static Tensor cpu(DataType dataType, Shape shape = {})

Allocate a cpu tensor with the given shape and data type.

Parameters:
  • shape – The shape of the tensor.

  • dataType – The data type of the tensor.

template<typename T>
static inline Tensor cpu(Shape shape = {})
static Tensor pinned(DataType dataType, Shape shape = {})

Allocate a cpu tensor in pinned memory with the given shape and data type.

Parameters:
  • shape – The shape of the tensor.

  • dataType – The data type of the tensor.

template<typename T>
static inline Tensor pinned(Shape shape = {})
static Tensor pooledPinned(DataType dataType, Shape shape = {})

Allocate a cpu tensor in pooled pinned memory with the given shape and data type.

Parameters:
  • shape – The shape of the tensor.

  • dataType – The data type of the tensor.

template<typename T>
static inline Tensor pooledPinned(Shape shape = {})
static Tensor managed(DataType dataType, Shape shape = {})

Allocate a tensor in managed memory (UVM) with the given shape and data type.

Parameters:
  • shape – The shape of the tensor.

  • dataType – The data type of the tensor.

template<typename T>
static inline Tensor managed(Shape shape = {})
static Tensor gpu(DataType dataType, CudaStreamPtr stream, Shape shape = {})

Allocate a gpu tensor with the given shape and data type on a particular cuda stream.

Parameters:
  • shape – The shape of the tensor.

  • stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.

  • dataType – The data type of the tensor.

template<typename T>
static inline Tensor gpu(CudaStreamPtr stream, Shape shape = {})
static Tensor of(DataType dataType, void *data, Shape shape)

Wrap a data pointer into a tensor without taking ownership.

Parameters:
  • shape – The shape of the tensor.

  • dataType – The data type of the tensor.

  • data – The pointer to the data to wrap; ownership is not taken.

template<typename T>
static inline Tensor of(T *data, Shape shape)

Wrap a data pointer into a tensor without taking ownership.

Parameters:
  • shape – The shape of the tensor.

  • dataType – The data type of the tensor.

  • data – The pointer to the data to wrap; ownership is not taken.

template<typename T>
static inline Tensor of(T &data)

Wrap any container into a tensor without taking ownership.

Parameters:
  • shape – The shape of the tensor.

  • dataType – The data type of the tensor.

  • data – The container to wrap; ownership is not taken.

Private Types

using Impl = runtime::ITensor

Private Functions

explicit Tensor(std::shared_ptr<runtime::ITensor> tensor)
Tensor copyTo(std::shared_ptr<Impl> tensor, CudaStreamPtr stream) const

Private Members

std::shared_ptr<Impl> mTensor

Private Static Functions

template<typename T>
static inline DataType getRuntimeType()

Friends

friend class Serialization
friend std::shared_ptr<runtime::ITensor> const &toITensor(Tensor const &tensor)
friend Tensor ofITensor(std::shared_ptr<runtime::ITensor> tensor)
namespace detail

Typedefs

using DimType64 = int64_t

Functions

std::shared_ptr<runtime::ITensor> const &toITensor(Tensor const &tensor)
Tensor ofITensor(std::shared_ptr<runtime::ITensor> tensor)
namespace runtime

executor.h

namespace tensorrt_llm
namespace batch_manager
namespace kv_cache_manager
namespace executor

Typedefs

using RetentionPriority = SizeType32
using KVCacheEventData = std::variant<KVCacheCreatedData, KVCacheStoredData, KVCacheRemovedData, KVCacheUpdatedData>

Functions

char const *version() noexcept

Version of TRT-LLM.

class SamplingConfig
#include <executor.h>

Sampling configuration.

Public Functions

explicit SamplingConfig(SizeType32 beamWidth = 1, std::optional<SizeType32> const &topK = std::nullopt, std::optional<FloatType> const &topP = std::nullopt, std::optional<FloatType> const &topPMin = std::nullopt, std::optional<TokenIdType> const &topPResetIds = std::nullopt, std::optional<FloatType> const &topPDecay = std::nullopt, std::optional<RandomSeedType> const &seed = std::nullopt, std::optional<FloatType> const &temperature = std::nullopt, std::optional<SizeType32> const &minTokens = std::nullopt, std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt, std::optional<FloatType> const &repetitionPenalty = std::nullopt, std::optional<FloatType> const &presencePenalty = std::nullopt, std::optional<FloatType> const &frequencyPenalty = std::nullopt, std::optional<FloatType> const &lengthPenalty = std::nullopt, std::optional<SizeType32> const &earlyStopping = std::nullopt, std::optional<SizeType32> const &noRepeatNgramSize = std::nullopt, std::optional<SizeType32> const &numReturnSequences = std::nullopt)

Constructor for SamplingConfig. See description of parameters below.

bool operator==(SamplingConfig const &other) const
SizeType32 getBeamWidth() const
SizeType32 getNumReturnBeams() const
std::optional<SizeType32> getTopK() const
std::optional<FloatType> getTopP() const
std::optional<FloatType> getTopPMin() const
std::optional<SizeType32> getTopPResetIds() const
std::optional<FloatType> getTopPDecay() const
std::optional<RandomSeedType> getSeed() const
std::optional<RandomSeedType> getRandomSeed() const
std::optional<FloatType> getTemperature() const
std::optional<SizeType32> getMinTokens() const
std::optional<SizeType32> getMinLength() const
std::optional<FloatType> getBeamSearchDiversityRate() const
std::optional<FloatType> getRepetitionPenalty() const
std::optional<FloatType> getPresencePenalty() const
std::optional<FloatType> getFrequencyPenalty() const
std::optional<FloatType> getLengthPenalty() const
std::optional<SizeType32> getEarlyStopping() const
std::optional<SizeType32> getNoRepeatNgramSize() const
std::optional<SizeType32> getNumReturnSequences() const
void setBeamWidth(SizeType32 beamWidth)
void setTopK(std::optional<SizeType32> const &topK)
void setTopP(std::optional<FloatType> const &topP)
void setTopPMin(std::optional<FloatType> const &topPMin)
void setTopPResetIds(std::optional<TokenIdType> const &topPResetIds)
void setTopPDecay(std::optional<FloatType> const &topPDecay)
void setSeed(std::optional<RandomSeedType> const &seed)
void setRandomSeed(std::optional<RandomSeedType> const &randomSeed)
void setTemperature(std::optional<FloatType> const &temperature)
void setMinTokens(std::optional<SizeType32> const &minTokens)
void setMinLength(std::optional<SizeType32> const &minLength)
void setBeamSearchDiversityRate(std::optional<FloatType> const &beamSearchDiversityRate)
void setRepetitionPenalty(std::optional<FloatType> const &repetitionPenalty)
void setPresencePenalty(std::optional<FloatType> const &presencePenalty)
void setFrequencyPenalty(std::optional<FloatType> const &frequencyPenalty)
void setLengthPenalty(std::optional<FloatType> const &lengthPenalty)
void setEarlyStopping(std::optional<SizeType32> const &earlyStopping)
void setNoRepeatNgramSize(std::optional<SizeType32> const &noRepeatNgramSize)
void setNumReturnSequences(std::optional<SizeType32> const &numReturnSequences)

Private Functions

void updateNumReturnBeams()

Private Members

SizeType32 mBeamWidth

The beam width. Default is 1 which disables beam search.

std::optional<SizeType32> mTopK

Controls number of logits to sample from. Default is 0 (all logits).

std::optional<FloatType> mTopP

Controls the top-P probability to sample from. Default is 0.f.

std::optional<FloatType> mTopPMin

Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.

std::optional<TokenIdType> mTopPResetIds

Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.

std::optional<FloatType> mTopPDecay

Controls decay in the top-P algorithm. The decay value. Default is 1.f.

std::optional<RandomSeedType> mSeed

Controls the random seed used by the random number generator in sampling.

std::optional<FloatType> mTemperature

Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f.

std::optional<SizeType32> mMinTokens

Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.

std::optional<FloatType> mBeamSearchDiversityRate

Controls the diversity in beam search.

std::optional<FloatType> mRepetitionPenalty

Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourage repetition, values > 1.f discourage it. Default is 1.f.

std::optional<FloatType> mPresencePenalty

Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.

std::optional<FloatType> mFrequencyPenalty

Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.

std::optional<FloatType> mLengthPenalty

Controls how to penalize longer sequences in beam search. Default is 0.f.

std::optional<SizeType32> mEarlyStopping

Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)

std::optional<SizeType32> mNoRepeatNgramSize

Controls how many repeat ngram size are acceptable. Default is 1 << 30.

std::optional<SizeType32> mNumReturnSequences

The number of return sequences or beams. In beam search, the value should be less than or equal to mBeamWidth. In sampling, it specifies the total number of independently generated sequences.

SizeType32 mNumReturnBeams

The number of beams to return. It is equal to beamWidth unless numReturnSequences is set. If beamWidth > 1 and numReturnSequences is set, then numReturnBeams is equal to numReturnSequences.

Private Static Functions

static SizeType32 checkBeamWidth(SizeType32 beamWidth)
static std::optional<FloatType> const &checkTopK(std::optional<FloatType> const &topK)
static std::optional<FloatType> const &checkTopP(std::optional<FloatType> const &topP)
static std::optional<FloatType> const &checkTopPMin(std::optional<FloatType> const &topPMin)
static std::optional<TokenIdType> const &checkTopPResetIds(std::optional<TokenIdType> const &topPResetIds)
static std::optional<FloatType> const &checkTopPDecay(std::optional<FloatType> const &topPDecay)
static std::optional<FloatType> const &checkTemperature(std::optional<FloatType> const &temperature)
static std::optional<FloatType> const &checkRepetitionPenalty(std::optional<FloatType> const &penalty)
static std::optional<SizeType32> const &checkMinTokens(std::optional<SizeType32> const &minTokens)
static std::optional<SizeType32> const &checkNoRepeatNgramSize(std::optional<SizeType32> const &noRepeatNgramSize)
static std::optional<FloatType> const &checkBeamSearchDiversityRate(std::optional<FloatType> const &beamSearchDiversityRate)
static std::optional<SizeType32> const &checkNumReturnSequences(std::optional<SizeType32> const &numReturnSequences, SizeType32 beamWidth)

Friends

friend class Serialization
class OutputConfig
#include <executor.h>

Configuration that controls the outputs of a Result.

Public Functions

explicit OutputConfig(bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, bool excludeInputFromOutput = false, bool returnEncoderOutput = false, bool returnPerfMetrics = false)

Public Members

bool returnLogProbs

Controls if Result should contain log probabilities. Default is false.

bool returnContextLogits

Controls if Result should contain the context logits. Default is false.

bool returnGenerationLogits

Controls if Result should contain the generation logits. Default is false.

bool excludeInputFromOutput

Controls if output tokens in Result should include the input tokens. Default is false.

bool returnEncoderOutput

Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Default is false.

bool returnPerfMetrics

Controls if Result should contain performance metrics.

class ExternalDraftTokensConfig
#include <executor.h>

Configuration for speculative decoding with external draft tokens. Allows including draft tokens and draft logits, and specifying an acceptance threshold.

Public Functions

explicit ExternalDraftTokensConfig(VecTokens tokens, std::optional<Tensor> logits = std::nullopt, std::optional<FloatType> const &acceptanceThreshold = std::nullopt, std::optional<bool> const &fastLogits = std::nullopt)
VecTokens getTokens() const
std::optional<Tensor> getLogits() const
std::optional<FloatType> getAcceptanceThreshold() const
std::optional<bool> getFastLogits() const

Private Members

VecTokens mTokens

The draft tokens.

std::optional<Tensor> mLogits

The draft logits. Expected shape: [num_draft_tokens, vocab_size].

std::optional<FloatType> mAcceptanceThreshold

The acceptance threshold. Must be > 0.f and <= 1.f.

std::optional<bool> mFastLogits

Use direct transfer for draft logits.

Friends

friend class Serialization
class PromptTuningConfig
#include <executor.h>

Configuration for prompt tuning.

Public Functions

explicit PromptTuningConfig(Tensor embeddingTable, std::optional<VecTokenExtraIds> inputTokenExtraIds = std::nullopt)
Tensor getEmbeddingTable() const
std::optional<VecTokenExtraIds> getInputTokenExtraIds() const

Private Members

Tensor mEmbeddingTable

The prompt embedding table. Expected shape: [task vocab_size, hidden_size]. Data type must match model weights.

std::optional<VecTokenExtraIds> mInputTokenExtraIds

The input token extra ids for KV Cache reuse when p-tuning is enabled.

Friends

friend class Serialization
class MropeConfig
#include <executor.h>

Configuration for mrope.

Public Functions

explicit MropeConfig(Tensor mropeRoratySinCos, SizeType32 mropePositionDeltas)
Tensor getMRopeRotarySinCos() const
SizeType32 getMRopePositionDeltas() const

Private Members

Tensor mMRopeRotarySinCos

The mrope rotary sin and cos cache. Expected shape: [maxPositionEmbeddings*rotaryEmbeddingDim]. Data type must be float32.

SizeType32 mMRopePositionDeltas

The mrope position deltas.

Friends

friend class Serialization
class LoraConfig
#include <executor.h>

Configuration for LoRA.

Public Functions

explicit LoraConfig(IdType taskId, std::optional<Tensor> weights = std::nullopt, std::optional<Tensor> config = std::nullopt)
IdType getTaskId() const
std::optional<Tensor> getWeights() const
std::optional<Tensor> getConfig() const

Private Members

IdType mTaskId

The Lora task id.

std::optional<Tensor> mWeights

The Lora weights. See TRT-LLM documentation for expected shapes and types.

std::optional<Tensor> mConfig

The Lora configuration. See TRT-LLM documentation for detailed description of the config tensor.

Friends

friend class Serialization
struct LookaheadDecodingConfig

Public Functions

LookaheadDecodingConfig(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize)
inline explicit LookaheadDecodingConfig()
bool operator==(LookaheadDecodingConfig const &other) const
std::tuple<SizeType32 const, SizeType32 const, SizeType32 const> get() const
SizeType32 getWindowSize() const
SizeType32 getNgramSize() const
SizeType32 getVerificationSetSize() const
std::tuple<SizeType32, SizeType32, SizeType32, SizeType32> calculateSpeculativeResource() const

return <maxDecodingTokens, maxPathLen, maxDraftTokens, maxDraftPathLen>

bool isLE(LookaheadDecodingConfig const &that) const

return true when this can be executed on resources defined by that

Public Static Functions

static bool isLegal(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize) noexcept

return true when the parameter combination is valid.

Private Members

SizeType32 mWindowSize
SizeType32 mNgramSize
SizeType32 mVerificationSetSize

Friends

friend class Serialization
struct EagleConfig

Public Functions

explicit EagleConfig(std::optional<EagleChoices> eagleChoices = std::nullopt, bool greedySampling = true, std::optional<float> posteriorThreshold = std::nullopt)
bool operator==(EagleConfig const &other) const
std::optional<EagleChoices> getEagleChoices() const
std::optional<float> getPosteriorThreshold() const
bool isGreedySampling() const

Private Functions

std::optional<float> const &checkPosteriorValue(std::optional<float> const &value)

Private Members

std::optional<EagleChoices> mEagleChoices

choices forming tree for EAGLE-1.

bool mGreedySampling

Flag to use greedy or typical acceptance.

std::optional<float> mPosteriorThreshold

Minimum token probability of the typical acceptance. Corresponds to epsilon in https://arxiv.org/pdf/2401.10774. Default is 0.09f.

Friends

friend class Serialization
class ContextPhaseParams

Public Types

using RequestIdType = std::uint64_t

Public Functions

explicit ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId)
ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId, void *state)
ContextPhaseParams(ContextPhaseParams const&)
ContextPhaseParams(ContextPhaseParams&&) noexcept
ContextPhaseParams &operator=(ContextPhaseParams const&)
ContextPhaseParams &operator=(ContextPhaseParams&&) noexcept
~ContextPhaseParams()
bool operator==(ContextPhaseParams const&) const noexcept
VecTokens const &getFirstGenTokens() const & noexcept
VecTokens popFirstGenTokens() && noexcept
RequestIdType getReqId() const noexcept
void const *getState() const noexcept
void *getState() noexcept
void *releaseState() noexcept

Private Types

using StatePtr = std::unique_ptr<void, decltype(&deleter)>

Private Members

RequestIdType mReqId = {0}

This request corresponds to the request ID in the context phase.

VecTokens mFirstGenTokens

The first tokens generated by context executor.

StatePtr mState = {nullptr, deleter}

Context phase state of this request.

Private Static Functions

static void deleter(void const *data)

Friends

friend class Serialization
class SpeculativeDecodingConfig
#include <executor.h>

Configuration for speculative decoding (both draft and target models)

Public Functions

explicit SpeculativeDecodingConfig(bool fastLogits = false)
bool operator==(SpeculativeDecodingConfig const &other) const

Public Members

bool fastLogits

Send logits tensor directly from draft to target model.

class GuidedDecodingParams
#include <executor.h>

Guided decoding parameters for a request.

Public Types

enum class GuideType

Values:

enumerator kJSON

The generated text is amenable to json format.

enumerator kJSON_SCHEMA

The generated text is amenable to json format with additional user-specified restrictions, namely schema.

enumerator kREGEX

The generated text is amenable to the user-specified regular expression.

enumerator kEBNF_GRAMMAR

The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. EBNF grammar is widely-used to express context-free grammars.

Public Functions

explicit GuidedDecodingParams(GuideType guideType, std::optional<std::string> guide = std::nullopt)
bool operator==(GuidedDecodingParams const &other) const
GuideType getGuideType() const
std::optional<std::string> getGuide() const

Private Members

GuideType mGuideType

The guide type. See GuideType.

std::optional<std::string> mGuide

The detailed guide string. It could be a json schema, a regular expression or an EBNF grammar depending on mGuideType.

Friends

friend class Serialization
struct RetentionPriorityAndDuration

Public Functions

inline RetentionPriorityAndDuration(std::optional<RetentionPriority> const &retentionPriority, std::optional<std::chrono::milliseconds> const &durationMs)

Public Members

std::optional<RetentionPriority> retentionPriority
std::optional<std::chrono::milliseconds> durationMs
class KvCacheRetentionConfig
#include <executor.h>

Configuration for the request’s retention in the KV Cache.

Public Functions

inline explicit KvCacheRetentionConfig()
explicit KvCacheRetentionConfig(std::vector<TokenRangeRetentionConfig> const &tokenRangeRetentionPriorities, RetentionPriority decodeRetentionPriority = kDefaultRetentionPriority, std::optional<std::chrono::milliseconds> decodeDurationMs = std::nullopt)
std::vector<TokenRangeRetentionConfig> getTokenRangeRetentionConfigs() const
RetentionPriority getDecodeRetentionPriority() const
std::optional<std::chrono::milliseconds> getDecodeDurationMs() const
std::vector<RetentionPriorityAndDuration> getPerBlockRetentionPriorityDuration(SizeType32 blockSize, SizeType32 seqLen) const

Convert the token range data into an entry per kv block. Returns a vector with one RetentionPriorityAndDuration entry (the priority and duration) for each block.

Public Static Attributes

static constexpr RetentionPriority kMinRetentionPriority = 0
static constexpr RetentionPriority kMaxRetentionPriority = 100
static constexpr RetentionPriority kDefaultRetentionPriority = 35

Private Members

std::vector<TokenRangeRetentionConfig> mTokenRangeRetentionConfigs

The token ranges and priority levels to update. Ranges must be non-overlapping. For example [(0, 64), (100, 128), (70, 80)] is valid, whereas [(0, 64), (60, 128)] is not.

RetentionPriority mDecodeRetentionPriority

The priority level to assign to blocks allocated in the decode phase.

std::optional<std::chrono::milliseconds> mDecodeDurationMs

The duration in ms that decode blocks should remain at their assigned priority level.

struct TokenRangeRetentionConfig
#include <executor.h>

A single entry to set block priorities over a token range. Earlier ranges always take priority over later ones. For example, with a block size of 16, a range of [0, 17] would be applied to the first two blocks.

Public Functions

inline explicit TokenRangeRetentionConfig(SizeType32 tokenStart, std::optional<SizeType32> tokenEnd = std::nullopt, RetentionPriority priority = KvCacheRetentionConfig::kDefaultRetentionPriority, std::optional<std::chrono::milliseconds> durationMs = std::nullopt)
inline bool operator==(TokenRangeRetentionConfig const &other) const

Public Members

SizeType32 tokenStart

The first token of this range.

std::optional<SizeType32> tokenEnd

The final token of this range. The end is not included in the range. This can be set to std::nullopt to extend the range to the end of the sequence.

RetentionPriority priority

The priority of this token range. Higher priorities are less likely to be evicted or offloaded.

std::optional<std::chrono::milliseconds> durationMs

The duration in ms that the block should remain at the given priority level. Set to std::nullopt to have no expiration time, and keep the block at the given priority level until it gets reclaimed. After the duration has passed, the block will be moved back to the kDefaultRetentionPriority level.

class Request
#include <executor.h>

A class that holds information about the request.

Public Functions

Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false, SamplingConfig const &samplingConfig = SamplingConfig(), OutputConfig const &outputConfig = OutputConfig(), std::optional<SizeType32> const &endId = std::nullopt, std::optional<SizeType32> const &padId = std::nullopt, std::optional<std::vector<SizeType32>> positionIds = std::nullopt, std::optional<std::list<VecTokens>> badWords = std::nullopt, std::optional<std::list<VecTokens>> stopWords = std::nullopt, std::optional<Tensor> embeddingBias = std::nullopt, std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig = std::nullopt, std::optional<PromptTuningConfig> pTuningConfig = std::nullopt, std::optional<MropeConfig> mRopeConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt, std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt, std::optional<std::string> logitsPostProcessorName = std::nullopt, std::optional<VecTokens> encoderInputTokenIds = std::nullopt, std::optional<IdType> clientId = std::nullopt, bool returnAllGeneratedTokens = false, PriorityType priority = kDefaultPriority, RequestType type = RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, std::optional<ContextPhaseParams> contextPhaseParams = std::nullopt, std::optional<Tensor> encoderInputFeatures = std::nullopt, std::optional<SizeType32> encoderOutputLength = std::nullopt, std::optional<Tensor> crossAttentionMask = std::nullopt, SizeType32 numReturnSequences = 1, std::optional<EagleConfig> eagleConfig = std::nullopt, std::optional<Tensor> skipCrossAttnBlocks = std::nullopt, std::optional<GuidedDecodingParams> guidedDecodingParams = std::nullopt, std::optional<MillisecondsType> allottedTimeMs = std::nullopt)

The Request constructor.

Parameters:
  • inputTokenIds – The input token ids

  • maxTokens – The maximum number of tokens to generate

  • streaming – Indicates if the responses should be streamed or not. Default is false.

  • samplingConfig – The sampling configuration

  • outputConfig – The output configuration

  • endId – The end token id

  • padId – The pad token id

  • positionIds – The input position ids

  • badWords – A list of bad words tokens. Each “word” can be composed of multiple tokens

  • stopWords – A list of stop words tokens. Each “word” can be composed of multiple tokens

  • embeddingBias – The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]

  • externalDraftTokensConfig – The speculative decoding with external draft tokens configuration

  • pTuningConfig – The prompt tuning configuration

  • loraConfig – The LoRA configuration

  • lookaheadConfig – The lookahead speculative decoding configuration

  • logitsPostProcessorName – The logits postprocessor name. Must correspond to one of the logits postprocessor names provided to the ExecutorConfig.

  • kvCacheRetentionConfig – The configuration used for KV cache block eviction.

  • encoderInputTokenIds – The encoder input token ids for encoder-decoder models, or encoder-only models

  • returnAllGeneratedTokens – Indicates whether to return the full beams or just the newly generated tokens after every streaming step.

  • priority – Sets the execution priority of this request.

  • encoderInputFeatures – Encoder input features for multimodal models.

  • encoderOutputLength – Encoder output length if encoder input and output have different lengths (due to convolution down-sampling, etc.)

  • crossAttentionMask – Cross attention mask.

  • type – Indicate the request type for disaggregated serving mode.

  • contextPhaseParams – Generated token ID from context only executor.

  • numReturnSequences – The number of returning sequences.

  • eagleConfig – The EAGLE speculative decoding configuration

  • skipCrossAttnBlocks – Skip the cross attention transformer blocks or not.

  • guidedDecodingParams – The guided decoding parameters.

  • allottedTimeMs – The allotted time in milliseconds after which the request is finished with a timedOut finish reason. The request will always exceed this time slightly, but by at most one forward pass. A request can be timed out before ever being scheduled.

Request(Request const &other)
Request(Request &&other) noexcept
Request &operator=(Request const &other)
Request &operator=(Request &&other) noexcept
~Request()
VecTokens getInputTokenIds() const
SizeType32 getMaxTokens() const
SizeType32 getMaxNewTokens() const
bool getStreaming() const
SamplingConfig getSamplingConfig() const
OutputConfig getOutputConfig() const
std::optional<SizeType32> getEndId() const
std::optional<SizeType32> getPadId() const
std::optional<std::vector<SizeType32>> getPositionIds() const
std::optional<std::list<VecTokens>> getBadWords() const
std::optional<std::list<VecTokens>> getStopWords() const
std::optional<Tensor> getEmbeddingBias() const
std::optional<ExternalDraftTokensConfig> getExternalDraftTokensConfig() const
std::optional<PromptTuningConfig> getPromptTuningConfig() const
std::optional<MropeConfig> getMropeConfig() const
std::optional<LoraConfig> getLoraConfig() const
std::optional<LookaheadDecodingConfig> getLookaheadConfig() const
std::optional<KvCacheRetentionConfig> getKvCacheRetentionConfig() const
std::optional<std::string> getLogitsPostProcessorName() const
std::optional<VecTokens> getEncoderInputTokenIds() const
std::optional<IdType> getClientId() const
PriorityType getPriority() const
bool getReturnAllGeneratedTokens() const
std::optional<ContextPhaseParams> const &getContextPhaseParams() const
std::optional<Tensor> getEncoderInputFeatures() const
std::optional<SizeType32> getEncoderOutputLength() const
std::optional<Tensor> getCrossAttentionMask() const
RequestType getRequestType() const
SizeType32 getNumReturnSequences() const
std::optional<EagleConfig> getEagleConfig() const
std::optional<Tensor> getSkipCrossAttnBlocks() const
std::optional<GuidedDecodingParams> getGuidedDecodingParams() const
std::optional<MillisecondsType> getAllottedTimeMs() const
void setStreaming(bool streaming)
void setSamplingConfig(SamplingConfig const &config)
void setOutputConfig(OutputConfig const &outputConfig)
void setEndId(SizeType32 endId)
void setPadId(SizeType32 padId)
void setPositionIds(std::vector<SizeType32> const &positionIds)
void setBadWords(std::list<VecTokens> const &badWords)
void setStopWords(std::list<VecTokens> const &stopWords)
void setEmbeddingBias(Tensor const &embeddingBias)
void setExternalDraftTokensConfig(ExternalDraftTokensConfig const &externalDraftTokensConfig)
void setPromptTuningConfig(PromptTuningConfig const &pTuningConfig)
void setMropeConfig(MropeConfig const &mRopeConfig)
void setLoraConfig(LoraConfig const &loraConfig)
void setLookaheadConfig(LookaheadDecodingConfig const &lookaheadConfig)
void setKvCacheRetentionConfig(KvCacheRetentionConfig const &kvCacheRetentionConfig)
void setLogitsPostProcessorName(std::string const &logitsPostProcessorName)
void setEncoderInputTokenIds(VecTokens const &encoderInputTokenIds)
void setClientId(IdType clientId)
void setPriority(PriorityType priority)
void setReturnAllGeneratedTokens(bool returnAllGeneratedTokens)
void setRequestType(RequestType const &requestType)
void setContextPhaseParams(ContextPhaseParams contextPhaseParams)
void setEncoderInputFeatures(Tensor encoderInputFeatures)
void setEncoderOutputLength(SizeType32 encoderOutputLength)
void setCrossAttentionMask(Tensor crossAttentionMask)
void setNumReturnSequences(SizeType32 numReturnSequences)
void setEagleConfig(std::optional<EagleConfig> const &eagleConfig)
void setSkipCrossAttnBlocks(Tensor skipCrossAttnBlocks)
void setGuidedDecodingParams(GuidedDecodingParams const &guidedDecodingParams)
void setAllottedTimeMs(MillisecondsType allottedTimeMs)

Public Static Attributes

static constexpr PriorityType kDefaultPriority = 0.5
static constexpr auto kBatchedPostProcessorName = "batched"

This logits postprocessor name will dispatch to the batched logits postprocessor.

Private Members

std::unique_ptr<Impl> mImpl

Friends

friend class Serialization
struct SpeculativeDecodingFastLogitsInfo
#include <executor.h>

Struct that holds the logits information when using direct transfer.

Public Functions

Tensor toTensor() const

Returns the struct serialized into a tensor that can be used as generation logits input.

Public Members

uint64_t draftRequestId

Draft request id.

int32_t draftParticipantId

MPI world rank of the draft model leader.

struct Result
#include <executor.h>

Struct that holds the generation result.

Public Members

bool isFinal

Indicates if this is the final result for the request.

BeamTokens outputTokenIds

The output tokens for each beam.

std::optional<VecLogProbs> cumLogProbs

The cumulative log probabilities. Size beamSize.

std::optional<std::vector<VecLogProbs>> logProbs

The log probabilities for each generated token. Size [beamSize, outputLen].

std::optional<Tensor> contextLogits

The context logits. Size [promptLen, vocabSizePadded].

std::optional<Tensor> generationLogits

The generation logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming) or [maxNewTokens, beamSize, vocabSizePadded] (streaming and allGeneratedTokens) or [1, beamSize, vocabSizePadded] (streaming and non-allGeneratedTokens)

std::optional<SpeculativeDecodingFastLogitsInfo> specDecFastLogitsInfo

Logits information for direct transfer when using fast logits.

std::optional<Tensor> encoderOutput

The encoder output. Size [encoderLen, hiddenSize].

std::vector<FinishReason> finishReasons

The reason why the model stopped generating tokens for each beam in this request. Size [beamSize]. Currently only supported when beamSize is 1 and when using BatchingType::kINFLIGHT.

std::optional<ContextPhaseParams> contextPhaseParams

The params of the context phase.

SizeType32 decodingIter = {0}

The number of decoding iterations used to generate the result. In autoregressive decoding, it is equal to the maximum length of the beam in outputTokenIds. In speculative decoding, it might be less than the maximum length of the beam in outputTokenIds, as more than one token can be generated per iteration. Used for speculative decoding statistics.

SizeType32 sequenceIndex = {0}

The index of the output sequence of this result where 0 <= sequenceIndex < numReturnSequences. In beam search (beamWidth > 1), this index will always be zero because all beams to be returned are included in this result.

bool isSequenceFinal

Indicates if this is the final result for a given sequence in the request. In beam search (beamWidth > 1), the value will always be equal to the value of isFinal.

std::optional<RequestPerfMetrics> requestPerfMetrics

Performance metrics if returnPerfMetrics is set in OutputConfig.

class Response
#include <executor.h>

Class that holds either an error or a result.

Public Functions

Response(IdType requestId, std::string errorMsg, std::optional<IdType> clientId = std::nullopt)
Response(IdType requestId, Result Result, std::optional<IdType> clientId = std::nullopt)
~Response()
Response(Response const &other)
Response(Response &&other) noexcept
Response &operator=(Response const &other)
Response &operator=(Response &&other) noexcept
IdType getRequestId() const

Get the id of the request for which this response was generated.

std::optional<IdType> getClientId() const

Get the client id of the request for which this response was generated.

bool hasError() const

Indicates if this response has an error or not.

std::string const &getErrorMsg() const

Get the error msg for this response. Will throw an exception if hasError is false.

Result const &getResult() const

Get the result for this response. Will throw an exception if hasError is true.

Private Members

std::unique_ptr<Impl> mImpl

Friends

friend class Serialization
class DynamicBatchConfig
#include <executor.h>

Configuration class for dynamic tuning of batch size and max num tokens. During runtime the statistics of input and output lengths are recorded. Based on these statistics, the batch size and max num tokens are tuned dynamically to better serve the requests.

Public Functions

explicit DynamicBatchConfig(bool enableBatchSizeTuning = false, bool enableMaxNumTokensTuning = false, SizeType32 dynamicBatchMovingAverageWindow = kDefaultDynamicBatchMovingAverageWindow, std::vector<std::pair<SizeType32, SizeType32>> batchSizeTable = kDefaultBatchSizeTable)
SizeType32 getDynamicBatchMovingAverageWindow() const
bool getEnableBatchSizeTuning() const
bool getEnableMaxNumTokensTuning() const
std::vector<std::pair<SizeType32, SizeType32>> getBatchSizeTable() const

Public Static Attributes

static SizeType32 const kDefaultDynamicBatchMovingAverageWindow = 128

The default window size for moving average of input and output length which is used to calculate dynamic batch size and max num tokens.

static std::vector<std::pair<SizeType32, SizeType32>> const kDefaultBatchSizeTable

The default value of batch size table.

Private Members

bool mEnableBatchSizeTuning

Controls if the batch size should be tuned dynamically.

bool mEnableMaxNumTokensTuning

Controls if the max num tokens should be tuned dynamically.

SizeType32 mDynamicBatchMovingAverageWindow

The window size for moving average of input and output length which is used to calculate dynamic batch size and max num tokens.

std::vector<std::pair<SizeType32, SizeType32>> mBatchSizeTable

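A vector of (batchSizeLimit, batchSize) pairs. When the max capacity batch size is less than an entry's batchSizeLimit (and not less than the previous entry's batchSizeLimit), that entry's batchSize is used.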

Friends

friend class Serialization
class SchedulerConfig
#include <executor.h>

Configuration class for the scheduler.

Public Functions

explicit SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt, std::optional<DynamicBatchConfig> dynamicBatchConfig = std::nullopt)
bool operator==(SchedulerConfig const &other) const
CapacitySchedulerPolicy getCapacitySchedulerPolicy() const
std::optional<ContextChunkingPolicy> getContextChunkingPolicy() const
std::optional<DynamicBatchConfig> getDynamicBatchConfig() const

Private Members

CapacitySchedulerPolicy mCapacitySchedulerPolicy

The capacity scheduler policy. See CapacitySchedulerPolicy.

std::optional<ContextChunkingPolicy> mContextChunkingPolicy

The context chunking policy. See ContextChunkingPolicy.

std::optional<DynamicBatchConfig> mDynamicBatchConfig

The config for tuning batch size dynamically. See DynamicBatchConfig.

Friends

friend class Serialization
class KvCacheConfig
#include <executor.h>

Configuration class for the KV cache.

Public Functions

explicit KvCacheConfig(bool enableBlockReuse = false, std::optional<SizeType32> const &maxTokens = std::nullopt, std::optional<std::vector<SizeType32>> const &maxAttentionWindowVec = std::nullopt, std::optional<SizeType32> const &sinkTokenLength = std::nullopt, std::optional<FloatType> const &freeGpuMemoryFraction = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt, bool onboardBlocks = true, std::optional<FloatType> const &crossKvCacheFraction = std::nullopt, std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0, std::optional<tensorrt_llm::runtime::RuntimeDefaults> const &runtimeDefaults = std::nullopt)
bool getEnableBlockReuse() const
std::optional<SizeType32> getMaxTokens() const
std::optional<std::vector<SizeType32>> getMaxAttentionWindowVec() const
std::optional<SizeType32> getSinkTokenLength() const
std::optional<FloatType> getFreeGpuMemoryFraction() const
std::optional<FloatType> getCrossKvCacheFraction() const
std::optional<size_t> getHostCacheSize() const
bool getOnboardBlocks() const
std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const
size_t getEventBufferMaxSize() const
void setEnableBlockReuse(bool enableBlockReuse)
void setMaxTokens(SizeType32 maxTokens)
void setMaxAttentionWindowVec(std::vector<SizeType32> maxAttentionWindowVec)
void setSinkTokenLength(SizeType32 sinkTokenLength)
void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction)
void setCrossKvCacheFraction(FloatType crossKvCacheFraction)
void setHostCacheSize(size_t hostCacheSize)
void setOnboardBlocks(bool onboardBlocks)
void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority)
void setEventBufferMaxSize(size_t eventBufferMaxSize)
void fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults runtimeDefaults)

Private Members

bool mEnableBlockReuse

Controls if KV cache blocks can be reused for different requests.

std::optional<SizeType32> mMaxTokens

The maximum number of tokens that should be stored in the KV cache. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.

std::optional<std::vector<SizeType32>> mMaxAttentionWindowVec

Size of the attention window for each sequence. Only the last mMaxAttentionWindow tokens of each sequence will be stored in the KV cache. Different layers may have different max attention window sizes. If the number of elements in mMaxAttentionWindowVec is less than the number of layers, mMaxAttentionWindowVec will be repeated as many times as needed to cover all layers.

std::optional<SizeType32> mSinkTokenLength

Number of sink tokens (tokens to always keep in attention window)

std::optional<FloatType> mFreeGpuMemoryFraction

The fraction of GPU memory that should be allocated for the KV cache. Default is 90%. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.

std::optional<FloatType> mCrossKvCacheFraction

The fraction of the KV Cache memory that should be reserved for cross attention. If set to p, self attention will use 1-p of KV Cache memory and cross attention will use p of KV Cache memory. Default is 50%. Should only be set when using an encoder-decoder model.

std::optional<size_t> mHostCacheSize

Size of secondary memory pool in bytes. Default is 0. Having a secondary memory pool increases KV cache block reuse potential.

bool mOnboardBlocks

Controls whether offloaded blocks should be onboarded back into primary memory before being reused.

std::optional<RetentionPriority> mSecondaryOffloadMinPriority

Only blocks with priority > mSecondaryOffloadMinPriority can be offloaded to secondary memory.

size_t mEventBufferMaxSize

Max size of the KV cache event buffer.

Friends

friend class Serialization
class ExtendedRuntimePerfKnobConfig
#include <executor.h>

Configuration class for the runtime perf knobs.

Public Functions

explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false, bool cudaGraphMode = false, SizeType32 cudaGraphCacheSize = 0)
inline bool operator==(ExtendedRuntimePerfKnobConfig const &other) const
bool getMultiBlockMode() const
bool getEnableContextFMHAFP32Acc() const
bool getCudaGraphMode() const
SizeType32 getCudaGraphCacheSize() const
void setMultiBlockMode(bool multiBlockMode)
void setEnableContextFMHAFP32Acc(bool enableContextFMHAFP32Acc)
void setCudaGraphMode(bool cudaGraphMode)
void setCudaGraphCacheSize(SizeType32 cacheSize)

Private Members

bool mMultiBlockMode

Control if multi block mode should be enabled or not.

bool mEnableContextFMHAFP32Acc

Controls whether FP32 accumulation is enabled for the context FMHA runner.

bool mCudaGraphMode

Controls whether CUDA graph mode is enabled.

SizeType32 mCudaGraphCacheSize

Number of cuda graphs to be cached in the runtime. The larger the cache, the better the perf, but more GPU memory is consumed.

Friends

friend class Serialization
class DebugConfig
#include <executor.h>

Configuration class for debugging output.

Public Functions

explicit DebugConfig(bool debugInputTensors = false, bool debugOutputTensors = false, StringVec debugTensorNames = {}, SizeType32 debugTensorsMaxIterations = 0)
bool operator==(DebugConfig const &other) const
bool getDebugInputTensors() const
bool getDebugOutputTensors() const
StringVec const &getDebugTensorNames() const
SizeType32 getDebugTensorsMaxIterations() const
void setDebugInputTensors(bool debugInputTensors)
void setDebugOutputTensors(bool debugOutputTensors)
void setDebugTensorNames(StringVec const &debugTensorNames)
void setDebugTensorsMaxIterations(SizeType32 debugTensorsMaxIterations)

Private Types

using StringVec = std::vector<std::string>

Private Members

bool mDebugInputTensors

If true, debug all input tensors.

bool mDebugOutputTensors

If true, debug all output tensors.

StringVec mDebugTensorNames

If not empty, only debug tensors in this list.

SizeType32 mDebugTensorsMaxIterations

If > 0, provide debug tensors for at most debugTensorsMaxIterations past iterations, else dump them to files.

Friends

friend class Serialization
class OrchestratorConfig

Public Functions

explicit OrchestratorConfig(bool isOrchestrator = true, std::string workerExecutablePath = "", std::shared_ptr<mpi::MpiComm> orchLeaderComm = nullptr, bool spawnProcesses = true)
bool getIsOrchestrator() const
std::string getWorkerExecutablePath() const
std::shared_ptr<mpi::MpiComm> getOrchLeaderComm() const
bool getSpawnProcesses() const
void setIsOrchestrator(bool isOrchestrator)
void setWorkerExecutablePath(std::string const &workerExecutablePath)
void setOrchLeaderComm(std::shared_ptr<mpi::MpiComm> const &orchLeaderComm)
void setSpawnProcesses(bool spawnProcesses)

Private Members

bool mIsOrchestrator
std::string mWorkerExecutablePath
std::shared_ptr<mpi::MpiComm> mOrchLeaderComm
bool mSpawnProcesses
class ParallelConfig
#include <executor.h>

A configuration class for the parallel execution parameters. Currently only supports commType = CommunicationType::kMPI.

Public Functions

explicit ParallelConfig(CommunicationType commType = CommunicationType::kMPI, CommunicationMode commMode = CommunicationMode::kLEADER, std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, std::optional<std::vector<SizeType32>> participantIds = std::nullopt, std::optional<OrchestratorConfig> const &orchestratorConfig = std::nullopt)

Constructor.

Parameters:
  • commType – The communication type. See CommunicationType.

  • commMode – The communication mode. See CommunicationMode.

  • deviceIds – The IDs of the GPUs involved in the execution of the model

  • participantIds – The participant IDs (MPI ranks if commType == kMPI) involved in the execution of the model. The first participant is considered to be the leader.

CommunicationType getCommunicationType() const
CommunicationMode getCommunicationMode() const
std::optional<std::vector<SizeType32>> getDeviceIds() const
std::optional<std::vector<SizeType32>> getParticipantIds() const
std::optional<OrchestratorConfig> getOrchestratorConfig() const
void setCommunicationType(CommunicationType type)
void setCommunicationMode(CommunicationMode mode)
void setDeviceIds(std::vector<SizeType32> const &deviceIds)
void setParticipantIds(std::vector<SizeType32> const &participantIds)
void setOrchestratorConfig(OrchestratorConfig const &orchestratorConfig)

Private Members

CommunicationType mCommType

The type of communication protocol used. Default is MPI.

CommunicationMode mCommMode

The mode of communication. See CommunicationMode.

std::optional<std::vector<SizeType32>> mDeviceIds

The GPU device ids to use for executing this model.

std::optional<std::vector<SizeType32>> mParticipantIds

The participant ids (MPI ranks for example) used for executing this model.

std::optional<OrchestratorConfig> mOrchestratorConfig

Optional orchestrator configuration.

Friends

friend class Serialization
class PeftCacheConfig
#include <executor.h>

Configuration class for the PeftCacheManager.

Public Functions

explicit PeftCacheConfig(SizeType32 numHostModuleLayer = 0, SizeType32 numDeviceModuleLayer = 0, SizeType32 optimalAdapterSize = kDefaultOptimalAdapterSize, SizeType32 maxAdapterSize = kDefaultMaxAdapterSize, SizeType32 numPutWorkers = 1, SizeType32 numEnsureWorkers = 1, SizeType32 numCopyStreams = 1, SizeType32 maxPagesPerBlockHost = kDefaultMaxPagesPerBlockHost, SizeType32 maxPagesPerBlockDevice = kDefaultMaxPagesPerBlockDevice, std::optional<float> const &deviceCachePercent = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt)
bool operator==(PeftCacheConfig const &other) const
SizeType32 getNumHostModuleLayer() const
SizeType32 getNumDeviceModuleLayer() const
SizeType32 getOptimalAdapterSize() const
SizeType32 getMaxAdapterSize() const
SizeType32 getNumPutWorkers() const
SizeType32 getNumEnsureWorkers() const
SizeType32 getNumCopyStreams() const
SizeType32 getMaxPagesPerBlockHost() const
SizeType32 getMaxPagesPerBlockDevice() const
std::optional<float> getDeviceCachePercent() const
std::optional<size_t> getHostCacheSize() const

Public Static Attributes

static constexpr SizeType32 kDefaultOptimalAdapterSize = 8
static constexpr SizeType32 kDefaultMaxAdapterSize = 64
static constexpr SizeType32 kDefaultMaxPagesPerBlockHost = 24
static constexpr SizeType32 kDefaultMaxPagesPerBlockDevice = 8

Private Members

SizeType32 mNumHostModuleLayer
SizeType32 mNumDeviceModuleLayer
SizeType32 mOptimalAdapterSize
SizeType32 mMaxAdapterSize
SizeType32 mNumPutWorkers
SizeType32 mNumEnsureWorkers
SizeType32 mNumCopyStreams
SizeType32 mMaxPagesPerBlockHost
SizeType32 mMaxPagesPerBlockDevice
std::optional<FloatType> mDeviceCachePercent
std::optional<size_t> mHostCacheSize

Friends

friend class Serialization
class DecodingConfig
#include <executor.h>

Configuration class for the decoding.

Public Functions

explicit DecodingConfig(std::optional<DecodingMode> decodingMode = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadDecodingConfig = std::nullopt, std::optional<MedusaChoices> medusaChoices = std::nullopt, std::optional<EagleConfig> eagleConfig = std::nullopt)
bool operator==(DecodingConfig const &other) const
void setDecodingMode(DecodingMode const&)

Sets decoding mode. Some modes require the use of their own setters.

std::optional<DecodingMode> getDecodingMode() const
void setLookaheadDecoding(LookaheadDecodingConfig const &lookaheadDecodingConfig)

Sets lookahead decoding mode and config.

std::optional<LookaheadDecodingConfig> getLookaheadDecodingConfig() const
void setMedusaChoices(MedusaChoices const&)

Sets medusa mode and config.

std::optional<MedusaChoices> getMedusaChoices() const
void setEagleConfig(EagleConfig const&)

Sets eagle mode and config.

std::optional<EagleConfig> getEagleConfig() const

Private Members

std::optional<DecodingMode> mDecodingMode
std::optional<LookaheadDecodingConfig> mLookaheadDecodingConfig
std::optional<MedusaChoices> mMedusaChoices
std::optional<EagleConfig> mEagleConfig

Friends

friend class Serialization
class GuidedDecodingConfig
#include <executor.h>

Guided decoding configurations for executor.

Public Types

enum class GuidedDecodingBackend

Values:

enumerator kXGRAMMAR

Enable guided decoding with XGrammar backend.

Public Functions

explicit GuidedDecodingConfig(GuidedDecodingBackend backend, std::optional<std::vector<std::string>> encodedVocab = std::nullopt, std::optional<std::string> tokenizerStr = std::nullopt, std::optional<std::vector<TokenIdType>> stopTokenIds = std::nullopt)
bool operator==(GuidedDecodingConfig const &other) const
void setBackend(GuidedDecodingBackend const &backend)
GuidedDecodingBackend getBackend() const
void setEncodedVocab(std::vector<std::string> const &encodedVocab)
std::optional<std::vector<std::string>> getEncodedVocab() const
void setTokenizerStr(std::string const &tokenizerStr)
std::optional<std::string> getTokenizerStr() const
void setStopTokenIds(std::vector<TokenIdType> const &stopTokenIds)
std::optional<std::vector<TokenIdType>> getStopTokenIds() const
void validate() const

Private Members

GuidedDecodingBackend mBackend

Guided decoding backend. Currently supports XGrammar.

std::optional<std::vector<std::string>> mEncodedVocab

Encoded vocabulary. For a huggingface tokenizer, it can be extracted by:

encoded_vocab = tokenizer.get_vocab()
encoded_vocab = [token for token, _ in sorted(encoded_vocab.items(), key=lambda x: x[1])]
std::optional<std::string> mTokenizerStr

Tokenizer string. For a huggingface fast tokenizer, it can be extracted by:

tokenizer_str = tokenizer.backend_tokenizer.to_str()
std::optional<std::vector<TokenIdType>> mStopTokenIds

Stop token ids. If not provided, it can be automatically detected.

Friends

friend class Serialization
class LogitsPostProcessorConfig

Public Functions

explicit LogitsPostProcessorConfig(std::optional<LogitsPostProcessorMap> processorMap = std::nullopt, std::optional<LogitsPostProcessorBatched> processorBatched = std::nullopt, bool replicate = true)
std::optional<LogitsPostProcessorMap> getProcessorMap() const
std::optional<LogitsPostProcessorBatched> getProcessorBatched() const
bool getReplicate() const
void setProcessorMap(LogitsPostProcessorMap const &processorMap)
void setProcessorBatched(LogitsPostProcessorBatched const &processorBatched)
void setReplicate(bool replicate)

Private Members

std::optional<LogitsPostProcessorMap> mProcessorMap

mapping from post processor names to non-batched post processors

std::optional<LogitsPostProcessorBatched> mProcessorBatched

single batched post processor

bool mReplicate

If set to true, logits post processor will run on all TP ranks in last PP rank.

class ExecutorConfig
#include <executor.h>

Configuration class for the model executor.

Public Functions

explicit ExecutorConfig(SizeType32 maxBeamWidth = 1, SchedulerConfig schedulerConfig = SchedulerConfig(), KvCacheConfig kvCacheConfig = KvCacheConfig(), bool enableChunkedContext = true, bool normalizeLogProbs = true, SizeType32 iterStatsMaxIterations = kDefaultIterStatsMaxIterations, SizeType32 requestStatsMaxIterations = kDefaultRequestStatsMaxIterations, BatchingType batchingType = BatchingType::kINFLIGHT, std::optional<SizeType32> maxBatchSize = std::nullopt, std::optional<SizeType32> maxNumTokens = std::nullopt, std::optional<ParallelConfig> parallelConfig = std::nullopt, std::optional<PeftCacheConfig> const &peftCacheConfig = std::nullopt, std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig = std::nullopt, std::optional<DecodingConfig> decodingConfig = std::nullopt, float gpuWeightsPercent = 1, std::optional<SizeType32> maxQueueSize = std::nullopt, ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(), std::optional<DebugConfig> debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0, uint64_t maxSeqIdleMicroseconds = kDefaultMaxSeqIdleMicroseconds, std::optional<SpeculativeDecodingConfig> specDecConfig = std::nullopt, std::optional<GuidedDecodingConfig> guidedDecodingConfig = std::nullopt)
SizeType32 getMaxBeamWidth() const
SchedulerConfig getSchedulerConfig() const
KvCacheConfig getKvCacheConfig() const
SchedulerConfig &getSchedulerConfigRef()
KvCacheConfig &getKvCacheConfigRef()
bool getEnableChunkedContext() const
bool getNormalizeLogProbs() const
SizeType32 getIterStatsMaxIterations() const
SizeType32 getRequestStatsMaxIterations() const
BatchingType getBatchingType() const
std::optional<SizeType32> getMaxBatchSize() const
std::optional<SizeType32> getMaxNumTokens() const
std::optional<ParallelConfig> getParallelConfig() const
std::optional<PeftCacheConfig> getPeftCacheConfig() const
std::optional<LogitsPostProcessorConfig> getLogitsPostProcessorConfig() const
std::optional<DecodingConfig> getDecodingConfig() const
float getGpuWeightsPercent() const
std::optional<SizeType32> getMaxQueueSize() const
ExtendedRuntimePerfKnobConfig getExtendedRuntimePerfKnobConfig() const
std::optional<DebugConfig> getDebugConfig() const
SizeType32 getRecvPollPeriodMs() const
uint64_t getMaxSeqIdleMicroseconds() const
std::optional<SpeculativeDecodingConfig> getSpecDecConfig() const
std::optional<GuidedDecodingConfig> getGuidedDecodingConfig() const
void setMaxBeamWidth(SizeType32 maxBeamWidth)
void setMaxBatchSize(SizeType32 maxBatchSize)
void setMaxNumTokens(SizeType32 maxNumTokens)
void setSchedulerConfig(SchedulerConfig const &schedulerConfig)
void setKvCacheConfig(KvCacheConfig const &kvCacheConfig)
void setEnableChunkedContext(bool enableChunkedContext)
void setNormalizeLogProbs(bool normalizeLogProbs)
void setIterStatsMaxIterations(SizeType32 iterStatsMaxIterations)
void setRequestStatsMaxIterations(SizeType32 requestStatsMaxIterations)
void setBatchingType(BatchingType batchingType)
void setParallelConfig(ParallelConfig const &parallelConfig)
void setPeftCacheConfig(PeftCacheConfig const &peftCacheConfig)
void setLogitsPostProcessorConfig(LogitsPostProcessorConfig const &logitsPostProcessorConfig)
void setDecodingConfig(DecodingConfig const &decodingConfig)
void setGpuWeightsPercent(float const &gpuWeightsPercent)
void setMaxQueueSize(std::optional<SizeType32> const &maxQueueSize)
void setExtendedRuntimePerfKnobConfig(ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig)
void setDebugConfig(DebugConfig const &debugConfig)
void setRecvPollPeriodMs(SizeType32 const &recvPollPeriodMs)
void setMaxSeqIdleMicroseconds(uint64_t maxNumTokens)
void setSpecDecConfig(SpeculativeDecodingConfig const &specDecConfig)
void setGuidedDecodingConfig(GuidedDecodingConfig const &guidedDecodingConfig)

Public Static Attributes

static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000
static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000
static constexpr SizeType32 kDefaultRequestStatsMaxIterations = 0

Private Members

SizeType32 mMaxBeamWidth

The beam width value of requests that will be sent to the executor.

SchedulerConfig mSchedulerConfig

The scheduler configuration.

KvCacheConfig mKvCacheConfig

The KV cache configuration.

bool mEnableChunkedContext

Controls whether chunked context is enabled.

bool mNormalizeLogProbs

Controls if log probabilities should be normalized or not.

SizeType32 mIterStatsMaxIterations

Controls the maximum number of iterations for which to keep statistics.

SizeType32 mRequestStatsMaxIterations

Controls the maximum number of iterations for which to keep per-request statistics.

BatchingType mBatchingType

The type of batching strategy to use. See BatchingType.

std::optional<SizeType32> mMaxBatchSize

The max batch size of requests.

std::optional<SizeType32> mMaxNumTokens

The max number of tokens per batch.

std::optional<ParallelConfig> mParallelConfig

The parallel execution configuration.

std::optional<PeftCacheConfig> mPeftCacheConfig
std::optional<LogitsPostProcessorConfig> mLogitsPostProcessorConfig

Logits post processor configuration.

std::optional<DecodingConfig> mDecodingConfig

Decoding configuration.

float mGpuWeightsPercent

GPU weights percent for weight streaming.

std::optional<SizeType32> mMaxQueueSize

The maximum number of requests allowed in queue before rejecting new requests.

ExtendedRuntimePerfKnobConfig mExtendedRuntimePerfKnobConfig

Config for perf knobs that can be set in runtime.

std::optional<DebugConfig> mDebugConfig

Debugging configuration.

SizeType32 mRecvPollPeriodMs

The time in ms between polls for new communication in orchestrator mode. Use 0 for busy loop.

uint64_t mMaxSeqIdleMicroseconds

The maximum time in microseconds a scheduled request can remain idle before getting terminated. Default is 3 minutes.

std::optional<SpeculativeDecodingConfig> mSpeculativeDecodingConfig

The speculative decoding configuration.

std::optional<GuidedDecodingConfig> mGuidedDecodingConfig

The guided decoding configuration.

Friends

friend class Serialization
struct KVCacheCreatedData

Public Members

std::vector<SizeType32> numBlocksPerCacheLevel

The amount of blocks at each cache level.

struct KVCacheStoredBlockData
#include <executor.h>

An entry for a single block stored into the tree.

Public Functions

inline KVCacheStoredBlockData(IdType blockHash, tensorrt_llm::runtime::VecUniqueTokens tokens, tensorrt_llm::runtime::LoraTaskIdType loraId, SizeType32 cacheLevel, SizeType32 priority)

Public Members

IdType blockHash

The hash of the block.

tensorrt_llm::runtime::VecUniqueTokens tokens

The unique tokens of the block.

tensorrt_llm::runtime::LoraTaskIdType loraId

The Lora task id of the block.

SizeType32 cacheLevel

The cache level of the block.

SizeType32 priority

The priority of the block.

struct KVCacheStoredData

Public Members

std::optional<IdType> parentHash

The parent of this sequence of stored blocks.

std::vector<KVCacheStoredBlockData> blocks

A sequence of blocks. The parent of block i is block i-1.

struct KVCacheRemovedData

Public Members

std::vector<IdType> blockHashes

The hashes of blocks being removed.

template<typename T>
struct KVCacheEventDiff

Public Members

T oldValue
T newValue
struct KVCacheUpdatedData

Public Functions

inline explicit KVCacheUpdatedData(IdType blockHash)
inline KVCacheUpdatedData &cacheLevelUpdated(SizeType32 oldValue, SizeType32 newValue)
inline KVCacheUpdatedData &priorityUpdated(SizeType32 oldValue, SizeType32 newValue)

Public Members

IdType blockHash

The hash of the updated block.

std::optional<KVCacheEventDiff<SizeType32>> cacheLevel = std::nullopt

The updated value of the cacheLevel field.

std::optional<KVCacheEventDiff<SizeType32>> priority = std::nullopt

The updated value of the priority field.

struct KVCacheEvent

Public Functions

KVCacheEvent(IdType eventId, KVCacheEventData data)

Public Members

IdType eventId

The unique id of this event.

KVCacheEventData data

The data corresponding to this event.

class KVCacheEventManager
#include <executor.h>

Exposes a limited set of KV cache manager functionalities.

Public Functions

KVCacheEventManager(std::shared_ptr<tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager> kvCacheManager)
std::deque<KVCacheEvent> getLatestEvents(std::optional<std::chrono::milliseconds> timeout = std::nullopt)

Get the latest KV Cache events.

Parameters:

timeout – The maximum time to wait for new events. If nullopt, will only return when new events are available, or when the executor instance has shutdown.

Private Members

std::shared_ptr<tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager> kvCacheManager
class Executor
#include <executor.h>

The executor is responsible for receiving new requests and sending responses, and running the inference.

Public Functions

Executor(std::filesystem::path const &modelPath, ModelType modelType, ExecutorConfig const &executorConfig)
Parameters:
  • modelPath – Path to the folder that defines the model to run

  • modelType – The type of model

  • executorConfig – The configuration for the executor

  • comm – An optional inter-process communicator configuration

Executor(std::filesystem::path const &encoderModelPath, std::filesystem::path const &decoderModelPath, ModelType modelType, ExecutorConfig const &executorConfig)
Executor(BufferView const &engineBuffer, std::string const &jsonConfigStr, ModelType modelType, ExecutorConfig const &executorConfig, std::optional<std::map<std::string, Tensor>> const &managedWeights = std::nullopt)
Executor(BufferView const &encoderEngineBuffer, std::string const &encoderJsonConfigStr, BufferView const &decoderEngineBuffer, std::string const &decoderJsonConfigStr, ModelType modelType, ExecutorConfig const &executorConfig)
Executor(std::shared_ptr<Model> model, ExecutorConfig const &executorConfig)
Executor(std::shared_ptr<Model> encoderModel, std::shared_ptr<Model> decoderModel, ExecutorConfig const &executorConfig)
~Executor()
Executor(Executor const &executor) = delete
Executor &operator=(Executor const &executor) = delete
Executor(Executor&&) = default
Executor &operator=(Executor&&) = default
IdType enqueueRequest(Request const &request)

Enqueue a new request.

Parameters:

request – The LLM request which contains input tokens and request parameters

Returns:

A unique id that identifies the request

std::vector<IdType> enqueueRequests(std::vector<Request> const &requests)

Enqueue a batch of requests.

std::vector<Response> awaitResponses(std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)

Await for ready responses.

   This overload awaits for any ready responses. In particular, if several requests
   have been enqueued, this method will provide any ready responses without order guarantees.

Parameters:

timeout – The maximum time to wait for new responses

Returns:

A vector of responses

std::vector<Response> awaitResponses(IdType const &requestId, std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)

Await for ready responses.

Parameters:
  • requestId – A request id

  • timeout – The maximum time to wait for new responses

Returns:

A vector of responses

std::vector<std::vector<Response>> awaitResponses(std::vector<IdType> const &requestIds, std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)

Await for multiple ready responses.

   A multiple ID request behaves as if awaitResponses(IdType, timeout)
   were invoked on all IDs. The returned vector contains
   a vector of responses per ID in the same order specified by the requestIds.
   The same behaviour as awaitResponses(IdType, timeout) applies:
   * Responses may be empty.
   * If all responses have already been given for one of the requestIds,
     then this method will hang unless a timeout is specified.

Parameters:
  • requestIds – Ids requested

  • timeout – The maximum time to wait for new responses

Returns:

A vector of vector of responses

SizeType32 getNumResponsesReady(std::optional<IdType> const &requestId = std::nullopt) const

Get the number of ready responses.

Parameters:

requestId – An optional request id

Returns:

The number of ready responses

void cancelRequest(IdType requestId)

Cancel the request with provided request id.

Parameters:

requestId – The request id for which to cancel the response

void shutdown()

Signals the server to shutdown.

This call is blocking. Only returns when all requests have terminated or the timeout has been reached.

std::deque<IterationStats> getLatestIterationStats()

Returns the per-iteration statistics computed since last call to getLatestIterationStats. Contains at most iterStatsMaxIterations iterations.

Returns:

Iteration stats

std::deque<RequestStatsPerIteration> getLatestRequestStats()

Returns the request stats of each iteration computed since last call to getLatestRequestStats. Contains at most requestStatsMaxIterations iterations.

Returns:

Request stats grouped by iterations

std::deque<DebugTensorsPerIteration> getLatestDebugTensors()

Returns the debug tensors of each iteration computed since last call to getLatestDebugTensors. Contains at most debugTensorsMaxIterations iterations.

Returns:

Request debug tensors grouped by iterations

bool canEnqueueRequests() const

Indicates if the current process is allowed to enqueueRequests.

bool isParticipant() const

Indicates if the current process participates in this executor instance.

std::optional<std::shared_ptr<KVCacheEventManager>> getKVCacheEventManager() const

Private Members

std::unique_ptr<Impl> mImpl
class JsonSerialization
#include <executor.h>

Class with utility functions to serialize statistics to json string.

Public Static Functions

static std::string toJsonStr(IterationStats const &iterationStats)

Utility function to convert an iterationStats struct to a json serialized string.

static std::string toJsonStr(RequestStatsPerIteration const &requestStatsPerIter)

Utility function to convert a requestStatsPerIteration struct to a json serialized string.

static std::string toJsonStr(RequestStats const &requestStats)

Utility function to convert a requestStats struct to a json serialized string.

namespace mpi

serialization.h

namespace tensorrt_llm
namespace executor
class Serialization

Public Static Functions

static RequestPerfMetrics::TimePoint deserializeTimePoint(std::istream &is)
static void serialize(RequestPerfMetrics::TimePoint const &tp, std::ostream &os)
static size_t serializedSize(RequestPerfMetrics::TimePoint const&)
static RequestPerfMetrics deserializeRequestPerfMetrics(std::istream &is)
static void serialize(RequestPerfMetrics const &metrics, std::ostream &os)
static size_t serializedSize(RequestPerfMetrics const &metrics)
static SamplingConfig deserializeSamplingConfig(std::istream &is)
static void serialize(SamplingConfig const &config, std::ostream &os)
static size_t serializedSize(SamplingConfig const &config)
static OutputConfig deserializeOutputConfig(std::istream &is)
static void serialize(OutputConfig const &config, std::ostream &os)
static size_t serializedSize(OutputConfig const &config)
static ExternalDraftTokensConfig deserializeExternalDraftTokensConfig(std::istream &is)
static void serialize(ExternalDraftTokensConfig const &config, std::ostream &os)
static size_t serializedSize(ExternalDraftTokensConfig const &config)
static PromptTuningConfig deserializePromptTuningConfig(std::istream &is)
static void serialize(PromptTuningConfig const &config, std::ostream &os)
static size_t serializedSize(PromptTuningConfig const &config)
static MropeConfig deserializeMropeConfig(std::istream &is)
static void serialize(MropeConfig const &config, std::ostream &os)
static size_t serializedSize(MropeConfig const &config)
static LoraConfig deserializeLoraConfig(std::istream &is)
static void serialize(LoraConfig const &config, std::ostream &os)
static size_t serializedSize(LoraConfig const &config)
static kv_cache::CommState deserializeCommState(std::istream &is)
static void serialize(kv_cache::CommState const &state, std::ostream &os)
static size_t serializedSize(kv_cache::CommState const &state)
static kv_cache::SocketState deserializeSocketState(std::istream &is)
static void serialize(kv_cache::SocketState const &state, std::ostream &os)
static size_t serializedSize(kv_cache::SocketState const &state)
static kv_cache::CacheState deserializeCacheState(std::istream &is)
static void serialize(kv_cache::CacheState const &state, std::ostream &os)
static size_t serializedSize(kv_cache::CacheState const &state)
static DataTransceiverState deserializeDataTransceiverState(std::istream &is)
static void serialize(DataTransceiverState const &dataTransceiverState, std::ostream &os)
static size_t serializedSize(DataTransceiverState const &dataTransceiverState)
static ContextPhaseParams deserializeContextPhaseParams(std::istream &is)
static void serialize(ContextPhaseParams const &contextPhaseParams, std::ostream &os)
static size_t serializedSize(ContextPhaseParams const &contextPhaseParams)
static Request deserializeRequest(std::istream &is)
static void serialize(Request const &request, std::ostream &os)
static size_t serializedSize(Request const &request)
static Tensor deserializeTensor(std::istream &is)
static void serialize(Tensor const &tensor, std::ostream &os)
static size_t serializedSize(Tensor const &tensor)
static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo(std::istream &is)
static void serialize(SpeculativeDecodingFastLogitsInfo const &info, std::ostream &os)
static size_t serializedSize(SpeculativeDecodingFastLogitsInfo const &info)
static Result deserializeResult(std::istream &is)
static void serialize(Result const &result, std::ostream &os)
static size_t serializedSize(Result const &result)
static Response deserializeResponse(std::istream &is)
static void serialize(Response const &response, std::ostream &os)
static size_t serializedSize(Response const &response)
static std::vector<Response> deserializeResponses(std::vector<char> &buffer)
static std::vector<char> serialize(std::vector<Response> const &responses)
static KvCacheConfig deserializeKvCacheConfig(std::istream &is)
static void serialize(KvCacheConfig const &kvCacheConfig, std::ostream &os)
static size_t serializedSize(KvCacheConfig const &kvCacheConfig)
static DynamicBatchConfig deserializeDynamicBatchConfig(std::istream &is)
static void serialize(DynamicBatchConfig const &dynamicBatchConfig, std::ostream &os)
static size_t serializedSize(DynamicBatchConfig const &dynamicBatchConfig)
static SchedulerConfig deserializeSchedulerConfig(std::istream &is)
static void serialize(SchedulerConfig const &schedulerConfig, std::ostream &os)
static size_t serializedSize(SchedulerConfig const &schedulerConfig)
static ExtendedRuntimePerfKnobConfig deserializeExtendedRuntimePerfKnobConfig(std::istream &is)
static void serialize(ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig, std::ostream &os)
static size_t serializedSize(ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig)
static ParallelConfig deserializeParallelConfig(std::istream &is)
static void serialize(ParallelConfig const &parallelConfig, std::ostream &os)
static size_t serializedSize(ParallelConfig const &parallelConfig)
static PeftCacheConfig deserializePeftCacheConfig(std::istream &is)
static void serialize(PeftCacheConfig const &peftCacheConfig, std::ostream &os)
static size_t serializedSize(PeftCacheConfig const &peftCacheConfig)
static OrchestratorConfig deserializeOrchestratorConfig(std::istream &is)
static void serialize(OrchestratorConfig const &orchestratorConfig, std::ostream &os)
static size_t serializedSize(OrchestratorConfig const &orchestratorConfig)
static DecodingMode deserializeDecodingMode(std::istream &is)
static void serialize(DecodingMode const &decodingMode, std::ostream &os)
static size_t serializedSize(DecodingMode const &decodingMode)
static LookaheadDecodingConfig deserializeLookaheadDecodingConfig(std::istream &is)
static void serialize(LookaheadDecodingConfig const &lookaheadDecodingConfig, std::ostream &os)
static size_t serializedSize(LookaheadDecodingConfig const &lookaheadDecodingConfig)
static EagleConfig deserializeEagleConfig(std::istream &is)
static void serialize(EagleConfig const &eagleConfig, std::ostream &os)
static size_t serializedSize(EagleConfig const &eagleConfig)
static SpeculativeDecodingConfig deserializeSpeculativeDecodingConfig(std::istream &is)
static void serialize(SpeculativeDecodingConfig const &specDecConfig, std::ostream &os)
static size_t serializedSize(SpeculativeDecodingConfig const &specDecConfig)
static GuidedDecodingConfig deserializeGuidedDecodingConfig(std::istream &is)
static void serialize(GuidedDecodingConfig const &guidedDecodingConfig, std::ostream &os)
static size_t serializedSize(GuidedDecodingConfig const &guidedDecodingConfig)
static GuidedDecodingParams deserializeGuidedDecodingParams(std::istream &is)
static void serialize(GuidedDecodingParams const &guidedDecodingParams, std::ostream &os)
static size_t serializedSize(GuidedDecodingParams const &guidedDecodingParams)
static KvCacheRetentionConfig deserializeKvCacheRetentionConfig(std::istream &is)
static void serialize(KvCacheRetentionConfig const &kvCacheRetentionConfig, std::ostream &os)
static size_t serializedSize(KvCacheRetentionConfig const &kvCacheRetentionConfig)
static KvCacheRetentionConfig::TokenRangeRetentionConfig deserializeTokenRangeRetentionConfig(std::istream &is)
static void serialize(KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig, std::ostream &os)
static size_t serializedSize(KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig)
static DecodingConfig deserializeDecodingConfig(std::istream &is)
static void serialize(DecodingConfig const &decodingConfig, std::ostream &os)
static size_t serializedSize(DecodingConfig const &decodingConfig)
static DebugConfig deserializeDebugConfig(std::istream &is)
static void serialize(DebugConfig const &debugConfig, std::ostream &os)
static size_t serializedSize(DebugConfig const &debugConfig)
static ExecutorConfig deserializeExecutorConfig(std::istream &is)
static void serialize(ExecutorConfig const &executorConfig, std::ostream &os)
static size_t serializedSize(ExecutorConfig const &executorConfig)
static KvCacheStats deserializeKvCacheStats(std::istream &is)
static void serialize(KvCacheStats const &kvCacheStats, std::ostream &os)
static size_t serializedSize(KvCacheStats const &kvCacheStats)
static StaticBatchingStats deserializeStaticBatchingStats(std::istream &is)
static void serialize(StaticBatchingStats const &staticBatchingStats, std::ostream &os)
static size_t serializedSize(StaticBatchingStats const &staticBatchingStats)
static InflightBatchingStats deserializeInflightBatchingStats(std::istream &is)
static void serialize(InflightBatchingStats const &inflightBatchingStats, std::ostream &os)
static size_t serializedSize(InflightBatchingStats const &inflightBatchingStats)
static IterationStats deserializeIterationStats(std::vector<char> &buffer)
static IterationStats deserializeIterationStats(std::istream &is)
static void serialize(IterationStats const &iterStats, std::ostream &os)
static std::vector<char> serialize(IterationStats const &iterStats)
static size_t serializedSize(IterationStats const &iterStats)
static std::vector<char> serialize(std::vector<IterationStats> const &iterStatsVec)
static std::vector<IterationStats> deserializeIterationStatsVec(std::vector<char> &buffer)
static DisServingRequestStats deserializeDisServingRequestStats(std::istream &is)
static void serialize(DisServingRequestStats const &stats, std::ostream &os)
static size_t serializedSize(DisServingRequestStats const &disServingRequestStats)
static RequestStage deserializeRequestStage(std::istream &is)
static void serialize(RequestStage const &requestStage, std::ostream &os)
static size_t serializedSize(RequestStage const &requestStage)
static RequestStats deserializeRequestStats(std::istream &is)
static void serialize(RequestStats const &state, std::ostream &os)
static size_t serializedSize(RequestStats const &state)
static RequestStatsPerIteration deserializeRequestStatsPerIteration(std::istream &is)
static RequestStatsPerIteration deserializeRequestStatsPerIteration(std::vector<char> &buffer)
static void serialize(RequestStatsPerIteration const &state, std::ostream &os)
static std::vector<char> serialize(RequestStatsPerIteration const &state)
static size_t serializedSize(RequestStatsPerIteration const &state)
static std::vector<char> serialize(std::vector<RequestStatsPerIteration> const &requestStatsVec)
static std::vector<RequestStatsPerIteration> deserializeRequestStatsPerIterationVec(std::vector<char> &buffer)
static std::string deserializeString(std::istream &is)
static bool deserializeBool(std::istream &is)
static ModelType deserializeModelType(std::istream &is)
namespace kv_cache