Executor
types.h
-
template<>
struct TypeTraits<std::int8_t>
-
template<>
struct TypeTraits<std::int32_t>
-
template<>
struct TypeTraits<std::int64_t>
-
template<>
struct TypeTraits<std::uint8_t>
-
namespace tensorrt_llm
-
namespace executor
Typedefs
-
using SizeType32 = std::int32_t
-
using FloatType = float
-
using TokenIdType = std::int32_t
-
using VecTokens = std::vector<TokenIdType>
-
using IdType = std::uint64_t
-
using IterationType = std::uint64_t
-
using RandomSeedType = std::uint64_t
-
using StreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>
-
using MillisecondsType = std::chrono::milliseconds
-
using LogitsPostProcessor = std::function<void(IdType, Tensor&, BeamTokens const&, StreamPtr const&, std::optional<IdType>)>
-
using LogitsPostProcessorMap = std::unordered_map<std::string, LogitsPostProcessor>
-
using LogitsPostProcessorBatched = std::function<void(std::vector<IdType> const&, std::vector<Tensor>&, std::vector<std::reference_wrapper<BeamTokens const>> const&, StreamPtr const&, std::vector<std::optional<IdType>> const&)>
-
using MedusaChoices = std::vector<std::vector<SizeType32>>
-
using EagleChoices = std::vector<std::vector<SizeType32>>
-
using PriorityType = float
-
using BufferView = std::basic_string_view<uint8_t>
Enums
-
enum class DataType
Values:
-
enumerator kBOOL
-
enumerator kUINT8
-
enumerator kINT8
-
enumerator kINT32
-
enumerator kINT64
-
enumerator kBF16
-
enumerator kFP8
-
enumerator kFP16
-
enumerator kFP32
-
enumerator kUNKNOWN
-
enumerator kBOOL
-
enum class RequestType
Values:
-
enumerator REQUEST_TYPE_CONTEXT_AND_GENERATION
-
enumerator REQUEST_TYPE_CONTEXT_ONLY
-
enumerator REQUEST_TYPE_GENERATION_ONLY
-
enumerator REQUEST_TYPE_CONTEXT_AND_GENERATION
-
enum class MemoryType
Values:
-
enumerator kCPU
-
enumerator kCPU_PINNED
-
enumerator kCPU_PINNEDPOOL
-
enumerator kGPU
-
enumerator kUVM
-
enumerator kUNKNOWN
-
enumerator kCPU
-
enum class ModelType
Values:
-
enumerator kDECODER_ONLY
-
enumerator kENCODER_ONLY
-
enumerator kENCODER_DECODER
-
enumerator kDECODER_ONLY
-
enum class BatchingType
The batching type.
Values:
-
enumerator kSTATIC
STATIC refers to the traditional batching scheme with a batch of requests running in lockstep until the full generation for all of them is complete. Requests in a batch are all padded up to the maximum input and output sequence length of any member of the batch.
-
enumerator kINFLIGHT
INFLIGHT refers to a scheme where newly arrived requests are dynamically incorporated into the batch under execution, and requests are returned as soon as the end condition is met without any padding.
-
enumerator kSTATIC
-
enum class CapacitySchedulerPolicy
The policy used to select the subset of available requests in each iteration of the executor generation loop.
Values:
-
enumerator kMAX_UTILIZATION
MAX_UTILIZATION packs as many requests as the underlying TRT engine can support in any iteration of the InflightBatching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability.
-
enumerator kGUARANTEED_NO_EVICT
GUARANTEED_NO_EVICT uses KV cache more conservatively guaranteeing that a request, once started, will run to completion without eviction.
-
enumerator kSTATIC_BATCH
kSTATIC_BATCH does not schedule new requests until all requests in current batch are completed. Similar to kGUARANTEED_NO_EVICT, requests will run to completion without eviction.
-
enumerator kMAX_UTILIZATION
-
enum class ContextChunkingPolicy
Values:
-
enumerator kFIRST_COME_FIRST_SERVED
Sequential chunking, complete the unfinished context phase first.
-
enumerator kEQUAL_PROGRESS
Iterate through each context request in sequence and attempt to increase its chunk count until the constraint is exceeded.
-
enumerator kFIRST_COME_FIRST_SERVED
-
enum class RequestStage
Enum class that represents the state of a request.
Values:
-
enumerator kQUEUED
Requests that have been received but not yet included in the active requests (due to constraints such as maximum batch size, for example).
-
enumerator kENCODER_IN_PROGRESS
Active request in encoder phase.
-
enumerator kCONTEXT_IN_PROGRESS
Active request in context phase.
-
enumerator kGENERATION_IN_PROGRESS
Active request in generation phase.
-
enumerator kGENERATION_COMPLETE
Active request for which generation has completed.
-
enumerator kQUEUED
-
enum class FinishReason
The reason why the model stopped generating tokens for a request.
Values:
-
enumerator kNOT_FINISHED
The request is not finished.
-
enumerator kEND_ID
The request finished because the end id was generated.
-
enumerator kSTOP_WORDS
The request finished because a stop word was generated.
-
enumerator kLENGTH
The request finished because the maximum number of tokens was reached.
-
enumerator kTIMED_OUT
The request finished because it timed out (via the mAllotedTime parameter).
-
enumerator kCANCELLED
The request was cancelled by calling cancelRequest.
-
enumerator kNOT_FINISHED
Functions
-
std::ostream &operator<<(std::ostream &os, CapacitySchedulerPolicy policy)
-
std::ostream &operator<<(std::ostream &os, ContextChunkingPolicy policy)
-
template<typename T, bool = false>
struct TypeTraits - #include <types.h>
For converting a C++ data type to a TrtLmmDataType.
-
template<>
struct TypeTraits<float>
-
template<>
struct TypeTraits<half>
- template<> int8_t >
- template<> int32_t >
- template<> int64_t >
-
template<>
struct TypeTraits<bool>
- template<> uint8_t >
-
struct KvCacheStats
- #include <types.h>
Struct that holds the stats of a KV cache manager.
Public Members
-
SizeType32 maxNumBlocks
Max number of blocks.
-
SizeType32 freeNumBlocks
Number of free blocks.
-
SizeType32 usedNumBlocks
Number of used blocks.
-
SizeType32 tokensPerBlock
Number of tokens per block.
-
SizeType32 allocTotalBlocks
Number of total allocated blocks.
-
SizeType32 allocNewBlocks
Number of newly allocated blocks.
-
SizeType32 reusedBlocks
Number of reused blocks.
-
SizeType32 missedBlocks
Number of blocks that were not reused.
-
float cacheHitRate
Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
-
SizeType32 maxNumBlocks
-
struct StaticBatchingStats
- #include <types.h>
Struct that holds the stats of static batching models for a single iteration.
Public Members
-
SizeType32 numScheduledRequests
Number of scheduled requests.
-
SizeType32 numContextRequests
Number of requests in context stage.
-
SizeType32 numCtxTokens
Total number of context tokens in the iteration.
-
SizeType32 numGenTokens
Total number of tokens to generate in the iteration.
-
SizeType32 emptyGenSlots
Total number of unused generation token slots.
-
SizeType32 numScheduledRequests
-
struct InflightBatchingStats
- #include <types.h>
Struct that holds the stats of inflight batching models for a single iteration.
Public Members
-
SizeType32 numScheduledRequests
Number of scheduled requests.
-
SizeType32 numContextRequests
Number of requests in context stage.
-
SizeType32 numGenRequests
Number of requests in generation stage.
-
SizeType32 numPausedRequests
Number of paused requests.
-
SizeType32 numCtxTokens
Total number of context tokens in the iteration.
-
SizeType32 microBatchId
Index of micro batch.
-
float avgNumDecodedTokensPerIter
Average number of tokens decoded per request per iteration.
-
SizeType32 numScheduledRequests
-
struct IterationStats
- #include <types.h>
Struct that holds the stats of a single iteration.
Public Members
-
std::string timestamp
Ending time of this iteration.
-
IterationType iter
Iteration id.
-
double iterLatencyMS
Iteration latency (ms)
-
double newActiveRequestsQueueLatencyMS
The total time spent in queue by the requests that became active in this iteration (ms)
-
SizeType32 numNewActiveRequests
Number of new fetched active requests.
-
SizeType32 numActiveRequests
Number of active requests.
-
SizeType32 numQueuedRequests
Number of queued requests.
-
SizeType32 numCompletedRequests
Number of requests that were completed in this iteration.
-
SizeType32 maxNumActiveRequests
Number of max active requests.
-
SizeType32 maxBatchSizeStatic
Static max batch size passed to the executor.
-
SizeType32 maxBatchSizeTunerRecommended
Batch size produced by dynamic tuner based on input stats.
-
SizeType32 maxBatchSizeRuntime
The min of maxBatchSizeStatic and maxBatchSizeRuntimeUpperbound.
-
SizeType32 maxNumTokensStatic
Static max num tokens passed to the executor.
-
SizeType32 maxNumTokensTunerRecommended
Max num tokens produced by dynamic tuner based on input stats.
-
SizeType32 maxNumTokensRuntime
The runtime max num tokens.
-
size_t gpuMemUsage
GPU memory usage in bytes.
-
size_t cpuMemUsage
CPU memory usage in bytes.
-
size_t pinnedMemUsage
Pinned memory usage in bytes.
-
std::optional<KvCacheStats> kvCacheStats
Stats specific to KV caches.
-
std::optional<KvCacheStats> crossKvCacheStats
Stats specific to cross KV caches.
-
std::optional<StaticBatchingStats> staticBatchingStats
Stats specific to static batching.
-
std::optional<InflightBatchingStats> inflightBatchingStats
Stats specific to inflight batching.
-
std::string timestamp
-
struct DisServingRequestStats
- #include <types.h>
Struct that holds the request stats in the case of disaggregated serving.
Public Members
-
double kvCacheTransferMS
The total time spent on transferring KV cache from context phase to generation phase (ms)
-
double kvCacheTransferMS
-
struct RequestStats
- #include <types.h>
Struct that holds the stats of a single request.
Public Members
-
RequestStage stage
The current stage the request is in.
-
SizeType32 contextPrefillPosition
If using chunked context, the current context prefill position.
-
SizeType32 numGeneratedTokens
The number of generated tokens so far.
-
float avgNumDecodedTokensPerIter
The average number of decoded tokens per iteration. It is >= 1 for speculative decoding.
-
bool scheduled
Whether the request is scheduled for the current iteration.
-
bool paused
Whether the request is being paused at the current iteration due to lack of resources (KV cache blocks exhaustion for example)
-
std::optional<DisServingRequestStats> disServingStats
Stats specific to disaggregated serving.
-
SizeType32 allocTotalBlocksPerRequest
Number of total allocated blocks per request.
-
SizeType32 allocNewBlocksPerRequest
Number of newly allocated blocks per request.
-
SizeType32 reusedBlocksPerRequest
Number of reused blocks per request.
-
SizeType32 missedBlocksPerRequest
Number of missed blocks per request.
-
SizeType32 kvCacheHitRatePerRequest
KV Cache Hit Rate per request, defined as reusedBlocks / (reusedBlocks + missedBlocks)
-
RequestStage stage
-
struct RequestStatsPerIteration
- #include <types.h>
Struct that holds the stats of all requests in an iteration.
Public Members
-
IterationType iter
The iteration id for these stats.
-
std::vector<RequestStats> requestStats
The stats of all active requests for this iteration.
-
IterationType iter
-
struct RequestPerfMetrics
- #include <types.h>
Struct that holds the stats of a request.
Public Types
-
using TimePoint = std::chrono::time_point<std::chrono::steady_clock>
Public Members
-
TimingMetrics timingMetrics
-
KvCacheMetrics kvCacheMetrics
-
std::optional<IterationType> firstIter
First iteration where the request was processed.
-
std::optional<IterationType> lastIter
Last iteration where a token was generated.
-
std::optional<IterationType> iter
Current iteration.
-
struct KvCacheMetrics
Public Members
-
SizeType32 numTotalAllocatedBlocks = {0}
Number of total allocated blocks.
-
SizeType32 numNewAllocatedBlocks = {0}
Number of newly allocated blocks.
-
SizeType32 numReusedBlocks = {0}
Number of reused blocks.
-
SizeType32 numMissedBlocks = {0}
Number of missed blocks.
-
SizeType32 kvCacheHitRate = {0}
KV Cache Hit Rate, defined as reusedBlocks / (reusedBlocks + missedBlocks)
-
SizeType32 numTotalAllocatedBlocks = {0}
-
struct TimingMetrics
Public Members
-
using TimePoint = std::chrono::time_point<std::chrono::steady_clock>
-
struct DebugTensorsPerIteration
- #include <types.h>
Struct that holds the debug tensors in an iteration.
Public Members
-
IterationType iter
The iteration id for these tensors.
-
IterationType iter
-
class DecodingMode
- #include <types.h>
mode of the decoder
Public Types
-
using UnderlyingType = uint32_t
Public Functions
-
inline constexpr auto useTemperature(bool useTemp)
-
inline constexpr auto useOccurrencePenalties(bool usePenalty)
-
inline constexpr auto usePresencePenalty(bool usePenalty)
-
inline constexpr auto useRepetitionPenalty(bool usePenalty)
-
inline constexpr auto useFrequencyPenalty(bool usePenalty)
-
inline constexpr auto useMinLength(bool useMinLen)
-
inline constexpr auto useBanTokens(bool banTokens)
-
inline constexpr auto useBanWords(bool banWords)
-
inline constexpr auto useNoRepeatNgramSize(bool noRepeatNgramSize)
-
inline constexpr auto useStopWords(bool stopWords)
-
inline constexpr auto useMaxLengthStop(bool maxLengthStop)
-
inline constexpr auto useExplicitEosStop(bool explicitEosStop)
-
inline constexpr bool isAuto() const
-
inline constexpr bool isTopK() const
-
inline constexpr bool isTopP() const
-
inline constexpr bool isTopKorTopP() const
-
inline constexpr bool isTopKandTopP() const
-
inline constexpr bool isBeamSearch() const
-
inline constexpr bool isMedusa() const
-
inline constexpr bool isLookahead() const
-
inline constexpr bool isExplicitDraftTokens() const
-
inline constexpr bool isExternalDraftTokens() const
-
inline constexpr bool isEagle() const
-
inline constexpr bool isUseTemperature() const
-
inline constexpr bool isUsePresencePenalty() const
-
inline constexpr bool isUseFrequencyPenalty() const
-
inline constexpr bool isUseRepetitionPenalty() const
-
inline constexpr bool isUseMinLength() const
-
inline constexpr bool isUseOccurrencePenalty() const
-
inline constexpr bool isUsePenalty() const
-
inline constexpr bool isUseBanWords() const
-
inline constexpr bool isUseNoRepeatNgramSize() const
-
inline constexpr bool isUseBanTokens() const
-
inline constexpr bool isUseStopWords() const
-
inline constexpr bool isUseMaxLengthStop() const
-
inline constexpr bool isUseExplicitEosStop() const
-
inline constexpr bool isUseStopCriteria() const
-
inline bool operator==(DecodingMode const &other) const
-
inline explicit constexpr DecodingMode(UnderlyingType state)
-
inline constexpr UnderlyingType getState() const
Public Static Functions
-
static inline constexpr auto Auto()
No mode specified. Config will be determined from the beam width of the first request at runtime: TopKTopP if beamWidth == 1, BeamSearch otherwise.
-
static inline constexpr auto TopK()
-
static inline constexpr auto TopP()
-
static inline constexpr auto TopKTopP()
-
static inline constexpr auto BeamSearch()
-
static inline constexpr auto Medusa()
-
static inline constexpr auto Lookahead()
-
static inline constexpr auto ExplicitDraftTokens()
-
static inline constexpr auto ExternalDraftTokens()
-
static inline constexpr auto Eagle()
Private Functions
-
inline constexpr bool anyBitSet(UnderlyingType bits) const
-
inline constexpr bool allBitSet(UnderlyingType bits) const
-
inline constexpr UnderlyingType setBitTo(UnderlyingType state, bool x)
Private Members
-
UnderlyingType mState = {}
Private Static Attributes
-
static constexpr UnderlyingType kUseRepetitionPenalties = {1u << 0}
-
static constexpr UnderlyingType kUseFrequencyPenalties = {1u << 1}
-
static constexpr UnderlyingType kUsePresencePenalties = {1u << 2}
-
static constexpr UnderlyingType kUseTemperature = {1u << 3}
-
static constexpr UnderlyingType kUseMinLength = {1u << 4}
-
static constexpr UnderlyingType kUseBanWords = {1u << 5}
-
static constexpr UnderlyingType kUseStopWords = {1u << 6}
-
static constexpr UnderlyingType kUseMaxLengthStop = {1u << 7}
-
static constexpr UnderlyingType kUseExplicitEosStop = {1u << 8}
-
static constexpr UnderlyingType kUseNoRepeatNgramSize = {1u << 9}
-
static constexpr UnderlyingType kStandardStopCriteria = {kUseStopWords | kUseMaxLengthStop}
-
static constexpr UnderlyingType kUseOccurrencePenalties{kUseRepetitionPenalties | kUseFrequencyPenalties | kUsePresencePenalties}
-
static constexpr UnderlyingType kUsePenalties = {kUseOccurrencePenalties | kUseTemperature | kUseMinLength}
-
static constexpr UnderlyingType kUseBanTokens = {kUseNoRepeatNgramSize | kUseBanWords}
-
static constexpr SizeType32 kNumFlags = {10}
-
static constexpr UnderlyingType kAuto = {1u << (kNumFlags + 0)}
-
static constexpr UnderlyingType kTopK = {1u << (kNumFlags + 1)}
-
static constexpr UnderlyingType kTopP = {1u << (kNumFlags + 2)}
-
static constexpr UnderlyingType kBeamSearch = {1u << (kNumFlags + 3)}
-
static constexpr UnderlyingType kMedusa = {1u << (kNumFlags + 4)}
-
static constexpr UnderlyingType kLookahead = {1u << (kNumFlags + 5)}
-
static constexpr UnderlyingType kExplicitDraftTokens = {1u << (kNumFlags + 6)}
-
static constexpr UnderlyingType kExternalDraftTokens = {1u << (kNumFlags + 7)}
-
static constexpr UnderlyingType kEagle = {1u << (kNumFlags + 8)}
-
static constexpr UnderlyingType kTopKTopP = {kTopK | kTopP}
-
using UnderlyingType = uint32_t
-
using SizeType32 = std::int32_t
-
namespace runtime
-
namespace executor
disaggServerUtil.h
-
namespace tensorrt_llm
-
namespace executor
-
namespace disagg_executor
-
struct ResponseWithId
Public Functions
-
inline ResponseWithId(tensorrt_llm::executor::Response &&response, IdType gid)
-
inline ResponseWithId(tensorrt_llm::executor::Response const &response, IdType gid)
-
inline ResponseWithId(ResponseWithId &&other) noexcept
-
ResponseWithId(ResponseWithId const &other) = default
-
inline ResponseWithId &operator=(ResponseWithId &&other) noexcept
-
inline ResponseWithId &operator=(ResponseWithId const &other)
-
~ResponseWithId() = default
-
inline ResponseWithId(tensorrt_llm::executor::Response &&response, IdType gid)
-
class DisaggExecutorOrchestrator
Public Functions
-
DisaggExecutorOrchestrator(std::vector<std::filesystem::path> const &ctxEnginePaths, std::vector<std::filesystem::path> const &genEnginePaths, std::vector<executor::ExecutorConfig> const &ctxExecutorConfigs, std::vector<executor::ExecutorConfig> const &genExecutorConfigs, bool hasContextAwaitThreads, bool hasGenAwaitThreads)
Constructs a DisaggExecutorOrchestrator object.
- Parameters:
ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each context executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
-
std::vector<IdType> enqueueContext(std::vector<texec::Request> const &requests, std::optional<int> selectContextId = std::nullopt, bool batch = false)
Enqueue context-only requests to context executors.
- Parameters:
requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue requests in the same context executor. If false, will try to use a different executor for each request.
- Returns:
A vector of global request ids, corresponding to the order of the requests in requests. The id returned may be different from the request id in each executor.
-
void enqueueGeneration(std::vector<texec::Request> const &requests, std::vector<IdType> const &globalRequestIds, std::optional<int> selectGenIdx = std::nullopt, bool batch = false)
Enqueue generation-only requests to generation executors.
- Parameters:
requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests, and must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue requests in the same generation executor. If false, will try to use a different executor for each request.
-
std::vector<ResponseWithId> awaitContextResponses(std::optional<std::chrono::milliseconds> const &timeout, std::optional<int> contextIdx = std::nullopt)
Await for context responses.
- Parameters:
timeout – The maximum time to wait for new responses
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors. If hasContextAwaitThreads is true, then this parameter must be std::nullopt.
- Returns:
A vector of responses with corresponding global request ids
-
std::vector<ResponseWithId> awaitGenerationResponses(std::optional<std::chrono::milliseconds> const &timeout, std::optional<int> genIdx = std::nullopt)
Await for generation responses.
- Parameters:
timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors. If hasGenAwaitThreads is true, then this parameter must be std::nullopt.
- Returns:
A vector of responses with corresponding global request ids.
-
bool canEnqueue() const
Indicates if the current process is allowed to enqueueRequests.
-
std::vector<std::unique_ptr<texec::Executor>> const &getContextExecutors() const
Get context executors.
-
std::vector<std::unique_ptr<texec::Executor>> const &getGenExecutors() const
Get generation executors.
-
~DisaggExecutorOrchestrator()
Private Members
-
std::unique_ptr<Impl> mImpl
-
DisaggExecutorOrchestrator(std::vector<std::filesystem::path> const &ctxEnginePaths, std::vector<std::filesystem::path> const &genEnginePaths, std::vector<executor::ExecutorConfig> const &ctxExecutorConfigs, std::vector<executor::ExecutorConfig> const &genExecutorConfigs, bool hasContextAwaitThreads, bool hasGenAwaitThreads)
-
struct ResponseWithId
-
namespace disagg_executor
-
namespace executor
tensor.h
-
namespace tensorrt_llm
-
namespace executor
-
class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>
Public Types
-
using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>
-
using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>
-
class Tensor
Public Types
-
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>
Public Functions
-
Tensor copyToCpu(Tensor::CudaStreamPtr stream = nullptr) const
-
Tensor copyToPinned(Tensor::CudaStreamPtr stream = nullptr) const
-
Tensor copyToPooledPinned(Tensor::CudaStreamPtr stream = nullptr) const
-
Tensor copyToManaged(Tensor::CudaStreamPtr stream = nullptr) const
-
Tensor copyToGpu(Tensor::CudaStreamPtr stream) const
-
Tensor() noexcept = default
-
~Tensor() = default
-
void *getData()
Returns a pointer to underlying array.
-
void const *getData() const
Returns a pointer to underlying array.
-
MemoryType getMemoryType() const
Returns the memory type of the buffer.
-
std::size_t getSize() const
Returns the number of elements in the tensor.
-
std::size_t getSizeInBytes() const
Returns the size of the tensor in bytes.
-
void setZero(CudaStreamPtr stream = nullptr)
Set the entire memory to zero.
- Parameters:
stream – Must be a valid CUDA stream if the memory type is GPU.
-
void setFrom(Tensor const &other, CudaStreamPtr stream = nullptr)
Copy the data and shape from another tensor.
- Parameters:
other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
-
inline explicit operator bool() const
Public Static Functions
-
static Tensor cpu(DataType dataType, Shape shape = {})
Allocate a cpu tensor with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor pinned(DataType dataType, Shape shape = {})
Allocate a cpu tensor in pinned memory with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor pooledPinned(DataType dataType, Shape shape = {})
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor managed(DataType dataType, Shape shape = {})
Allocate a tensor in managed memory (UVM) with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor gpu(DataType dataType, CudaStreamPtr stream, Shape shape = {})
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
- Parameters:
shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
-
template<typename T>
static inline Tensor gpu(CudaStreamPtr stream, Shape shape = {})
-
static Tensor of(DataType dataType, void *data, Shape shape)
Wrap a data pointer into a tensor without taking ownership.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
data – The data pointer to wrap; the tensor does not take ownership of it.
Private Functions
-
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>
-
class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>
-
namespace runtime
-
namespace executor
executor.h
-
namespace tensorrt_llm
-
-
namespace executor
Typedefs
-
using RetentionPriority = SizeType32
-
using KVCacheEventData = std::variant<KVCacheCreatedData, KVCacheStoredData, KVCacheRemovedData, KVCacheUpdatedData>
Functions
-
char const *version() noexcept
Version of TRT-LLM.
-
class SamplingConfig
- #include <executor.h>
Sampling configuration.
Public Functions
-
explicit SamplingConfig(SizeType32 beamWidth = 1, std::optional<SizeType32> const &topK = std::nullopt, std::optional<FloatType> const &topP = std::nullopt, std::optional<FloatType> const &topPMin = std::nullopt, std::optional<TokenIdType> const &topPResetIds = std::nullopt, std::optional<FloatType> const &topPDecay = std::nullopt, std::optional<RandomSeedType> const &seed = std::nullopt, std::optional<FloatType> const &temperature = std::nullopt, std::optional<SizeType32> const &minTokens = std::nullopt, std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt, std::optional<FloatType> const &repetitionPenalty = std::nullopt, std::optional<FloatType> const &presencePenalty = std::nullopt, std::optional<FloatType> const &frequencyPenalty = std::nullopt, std::optional<FloatType> const &lengthPenalty = std::nullopt, std::optional<SizeType32> const &earlyStopping = std::nullopt, std::optional<SizeType32> const &noRepeatNgramSize = std::nullopt, std::optional<SizeType32> const &numReturnSequences = std::nullopt)
Constructor for SamplingConfig. See description of parameters below.
-
bool operator==(SamplingConfig const &other) const
-
SizeType32 getBeamWidth() const
-
SizeType32 getNumReturnBeams() const
-
std::optional<SizeType32> getTopK() const
-
std::optional<SizeType32> getTopPResetIds() const
-
std::optional<RandomSeedType> getSeed() const
-
std::optional<RandomSeedType> getRandomSeed() const
-
std::optional<SizeType32> getMinTokens() const
-
std::optional<SizeType32> getMinLength() const
-
std::optional<SizeType32> getEarlyStopping() const
-
std::optional<SizeType32> getNoRepeatNgramSize() const
-
std::optional<SizeType32> getNumReturnSequences() const
-
void setBeamWidth(SizeType32 beamWidth)
-
void setTopK(std::optional<SizeType32> const &topK)
-
void setTopPResetIds(std::optional<TokenIdType> const &topPResetIds)
-
void setSeed(std::optional<RandomSeedType> const &seed)
-
void setRandomSeed(std::optional<RandomSeedType> const &randomSeed)
-
void setMinTokens(std::optional<SizeType32> const &minTokens)
-
void setMinLength(std::optional<SizeType32> const &minLength)
-
void setEarlyStopping(std::optional<SizeType32> const &earlyStopping)
-
void setNoRepeatNgramSize(std::optional<SizeType32> const &noRepeatNgramSize)
-
void setNumReturnSequences(std::optional<SizeType32> const &numReturnSequences)
Private Functions
-
void updateNumReturnBeams()
Private Members
-
SizeType32 mBeamWidth
The beam width. Default is 1 which disables beam search.
-
std::optional<SizeType32> mTopK
Controls number of logits to sample from. Default is 0 (all logits).
-
std::optional<FloatType> mTopPMin
Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.
-
std::optional<TokenIdType> mTopPResetIds
Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.
-
std::optional<FloatType> mTopPDecay
Controls decay in the top-P algorithm. The decay value. Default is 1.f.
-
std::optional<RandomSeedType> mSeed
Controls the random seed used by the random number generator in sampling.
-
std::optional<FloatType> mTemperature
Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f.
-
std::optional<SizeType32> mMinTokens
Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.
-
std::optional<FloatType> mRepetitionPenalty
Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourage repetition, values > 1.f discourage it. Default is 1.f.
-
std::optional<FloatType> mPresencePenalty
Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
-
std::optional<FloatType> mFrequencyPenalty
Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
-
std::optional<FloatType> mLengthPenalty
Controls how to penalize longer sequences in beam search. Default is 0.f.
-
std::optional<SizeType32> mEarlyStopping
Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)
-
std::optional<SizeType32> mNoRepeatNgramSize
Controls how many repeat ngram size are acceptable. Default is 1 << 30.
-
std::optional<SizeType32> mNumReturnSequences
The number of return sequences or beams. In beam search, the value should be less than or equal to mBeamWidth. In sampling, it specifies the total number of independently generated sequences.
-
SizeType32 mNumReturnBeams
The number of beams to return. It is equal to beamWidth unless numReturnSequences is set. If beamWidth > 1 and numReturnSequences is set, then numReturnBeams is equal to numReturnSequences.
Private Static Functions
-
static SizeType32 checkBeamWidth(SizeType32 beamWidth)
-
static std::optional<TokenIdType> const &checkTopPResetIds(std::optional<TokenIdType> const &topPResetIds)
-
static std::optional<FloatType> const &checkTemperature(std::optional<FloatType> const &temperature)
-
static std::optional<FloatType> const &checkRepetitionPenalty(std::optional<FloatType> const &penalty)
-
static std::optional<SizeType32> const &checkMinTokens(std::optional<SizeType32> const &minTokens)
-
static std::optional<SizeType32> const &checkNoRepeatNgramSize(std::optional<SizeType32> const &noRepeatNgramSize)
-
static std::optional<FloatType> const &checkBeamSearchDiversityRate(std::optional<FloatType> const &beamSearchDiversityRate)
-
static std::optional<SizeType32> const &checkNumReturnSequences(std::optional<SizeType32> const &numReturnSequences, SizeType32 beamWidth)
Friends
- friend class Serialization
-
explicit SamplingConfig(SizeType32 beamWidth = 1, std::optional<SizeType32> const &topK = std::nullopt, std::optional<FloatType> const &topP = std::nullopt, std::optional<FloatType> const &topPMin = std::nullopt, std::optional<TokenIdType> const &topPResetIds = std::nullopt, std::optional<FloatType> const &topPDecay = std::nullopt, std::optional<RandomSeedType> const &seed = std::nullopt, std::optional<FloatType> const &temperature = std::nullopt, std::optional<SizeType32> const &minTokens = std::nullopt, std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt, std::optional<FloatType> const &repetitionPenalty = std::nullopt, std::optional<FloatType> const &presencePenalty = std::nullopt, std::optional<FloatType> const &frequencyPenalty = std::nullopt, std::optional<FloatType> const &lengthPenalty = std::nullopt, std::optional<SizeType32> const &earlyStopping = std::nullopt, std::optional<SizeType32> const &noRepeatNgramSize = std::nullopt, std::optional<SizeType32> const &numReturnSequences = std::nullopt)
-
class OutputConfig
- #include <executor.h>
Configuration that controls the outputs of a Result.
Public Functions
-
explicit OutputConfig(bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, bool excludeInputFromOutput = false, bool returnEncoderOutput = false, bool returnPerfMetrics = false)
Public Members
-
bool returnGenerationLogits
Controls if Result should contain the generation logits. Default is false.
-
bool excludeInputFromOutput
Controls if the input tokens are excluded from the output tokens in Result. When false, the output tokens include the input tokens. Default is false.
-
explicit OutputConfig(bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, bool excludeInputFromOutput = false, bool returnEncoderOutput = false, bool returnPerfMetrics = false)
-
class ExternalDraftTokensConfig
- #include <executor.h>
Configuration for speculative decoding with external draft tokens. Allows including draft tokens and draft logits, and specifying an acceptance threshold.
Public Functions
-
explicit ExternalDraftTokensConfig(VecTokens tokens, std::optional<Tensor> logits = std::nullopt, std::optional<FloatType> const &acceptanceThreshold = std::nullopt, std::optional<bool> const &fastLogits = std::nullopt)
-
std::optional<bool> getFastLogits() const
Private Members
-
std::optional<bool> mFastLogits
Use direct transfer for draft logits.
Friends
- friend class Serialization
-
explicit ExternalDraftTokensConfig(VecTokens tokens, std::optional<Tensor> logits = std::nullopt, std::optional<FloatType> const &acceptanceThreshold = std::nullopt, std::optional<bool> const &fastLogits = std::nullopt)
-
class PromptTuningConfig
- #include <executor.h>
Configuration for prompt tuning.
Public Functions
-
explicit PromptTuningConfig(Tensor embeddingTable, std::optional<VecTokenExtraIds> inputTokenExtraIds = std::nullopt)
-
std::optional<VecTokenExtraIds> getInputTokenExtraIds() const
Private Members
-
Tensor mEmbeddingTable
The prompt embedding table. Expected shape: [task vocab_size, hidden_size]. Data type must match model weights.
-
std::optional<VecTokenExtraIds> mInputTokenExtraIds
The input token extra ids for KV Cache reuse when p-tuning is enabled.
Friends
- friend class Serialization
-
explicit PromptTuningConfig(Tensor embeddingTable, std::optional<VecTokenExtraIds> inputTokenExtraIds = std::nullopt)
-
class MropeConfig
- #include <executor.h>
Configuration for mrope.
Public Functions
-
explicit MropeConfig(Tensor mropeRoratySinCos, SizeType32 mropePositionDeltas)
-
SizeType32 getMRopePositionDeltas() const
Private Members
-
Tensor mMRopeRotarySinCos
The mrope rotary sin and cos cache. Expected shape: [maxPositionEmbeddings*rotaryEmbeddingDim]. Data type must be float32.
-
SizeType32 mMRopePositionDeltas
The mrope position deltas.
Friends
- friend class Serialization
-
explicit MropeConfig(Tensor mropeRoratySinCos, SizeType32 mropePositionDeltas)
-
class LoraConfig
- #include <executor.h>
Configuration for LoRA.
Public Functions
Private Members
Friends
- friend class Serialization
-
struct LookaheadDecodingConfig
Public Functions
-
LookaheadDecodingConfig(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize)
-
inline explicit LookaheadDecodingConfig()
-
bool operator==(LookaheadDecodingConfig const &other) const
-
std::tuple<SizeType32 const, SizeType32 const, SizeType32 const> get() const
-
SizeType32 getWindowSize() const
-
SizeType32 getNgramSize() const
-
SizeType32 getVerificationSetSize() const
-
std::tuple<SizeType32, SizeType32, SizeType32, SizeType32> calculateSpeculativeResource() const
return <maxDecodingTokens, maxPathLen, maxDraftTokens, maxDraftPathLen>
-
bool isLE(LookaheadDecodingConfig const &that) const
return true when `this` can be executed on resources defined by `that`
Public Static Functions
-
static bool isLegal(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize) noexcept
return true when the parameter combination is valid.
Friends
- friend class Serialization
-
LookaheadDecodingConfig(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize)
-
struct EagleConfig
Public Functions
-
explicit EagleConfig(std::optional<EagleChoices> eagleChoices = std::nullopt, bool greedySampling = true, std::optional<float> posteriorThreshold = std::nullopt)
-
bool operator==(EagleConfig const &other) const
-
std::optional<EagleChoices> getEagleChoices() const
-
std::optional<float> getPosteriorThreshold() const
-
bool isGreedySampling() const
Private Functions
-
std::optional<float> const &checkPosteriorValue(std::optional<float> const &value)
Private Members
-
std::optional<EagleChoices> mEagleChoices
choices forming tree for EAGLE-1.
-
bool mGreedySampling
Flag to use greedy or typical acceptance.
-
std::optional<float> mPosteriorThreshold
Minimum token probability of the typical acceptance. Corresponds to epsilon in https://arxiv.org/pdf/2401.10774. Default is 0.09f.
Friends
- friend class Serialization
-
explicit EagleConfig(std::optional<EagleChoices> eagleChoices = std::nullopt, bool greedySampling = true, std::optional<float> posteriorThreshold = std::nullopt)
-
class ContextPhaseParams
Public Types
-
using RequestIdType = std::uint64_t
Public Functions
-
explicit ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId)
-
ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId, void *state)
-
ContextPhaseParams(ContextPhaseParams const&)
-
ContextPhaseParams(ContextPhaseParams&&) noexcept
-
ContextPhaseParams &operator=(ContextPhaseParams const&)
-
ContextPhaseParams &operator=(ContextPhaseParams&&) noexcept
-
~ContextPhaseParams()
-
bool operator==(ContextPhaseParams const&) const noexcept
-
RequestIdType getReqId() const noexcept
-
void const *getState() const noexcept
-
void *getState() noexcept
-
void *releaseState() noexcept
Private Members
-
RequestIdType mReqId = {0}
This request corresponds to the request ID in the context phase.
Private Static Functions
-
static void deleter(void const *data)
Friends
- friend class Serialization
-
using RequestIdType = std::uint64_t
-
class SpeculativeDecodingConfig
- #include <executor.h>
Configuration for speculative decoding (both draft and target models)
Public Functions
-
explicit SpeculativeDecodingConfig(bool fastLogits = false)
-
bool operator==(SpeculativeDecodingConfig const &other) const
Public Members
-
bool fastLogits
Send logits tensor directly from draft to target model.
-
explicit SpeculativeDecodingConfig(bool fastLogits = false)
-
class GuidedDecodingParams
- #include <executor.h>
Guided decoding parameters for a request.
Public Types
-
enum class GuideType
Values:
-
enumerator kJSON
The generated text is amenable to json format.
-
enumerator kJSON_SCHEMA
The generated text is amenable to json format with additional user-specified restrictions, namely schema.
-
enumerator kREGEX
The generated text is amenable to the user-specified regular expression.
-
enumerator kEBNF_GRAMMAR
The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. EBNF grammar is widely-used to express context-free grammars.
-
enumerator kJSON
Public Functions
-
explicit GuidedDecodingParams(GuideType guideType, std::optional<std::string> guide = std::nullopt)
-
bool operator==(GuidedDecodingParams const &other) const
-
std::optional<std::string> getGuide() const
Private Members
-
std::optional<std::string> mGuide
The detailed guide string. It could be a json schema, a regular expression or an EBNF grammar depending on mGuideType.
Friends
- friend class Serialization
-
enum class GuideType
-
struct RetentionPriorityAndDuration
Public Functions
-
inline RetentionPriorityAndDuration(std::optional<RetentionPriority> const &retentionPriority, std::optional<std::chrono::milliseconds> const &durationMs)
Public Members
-
std::optional<RetentionPriority> retentionPriority
-
std::optional<std::chrono::milliseconds> durationMs
-
inline RetentionPriorityAndDuration(std::optional<RetentionPriority> const &retentionPriority, std::optional<std::chrono::milliseconds> const &durationMs)
-
class KvCacheRetentionConfig
- #include <executor.h>
Configuration for the request’s retention in the KV Cache.
Public Functions
-
inline explicit KvCacheRetentionConfig()
-
explicit KvCacheRetentionConfig(std::vector<TokenRangeRetentionConfig> const &tokenRangeRetentionPriorities, RetentionPriority decodeRetentionPriority = kDefaultRetentionPriority, std::optional<std::chrono::milliseconds> decodeDurationMs = std::nullopt)
-
std::vector<TokenRangeRetentionConfig> getTokenRangeRetentionConfigs() const
-
RetentionPriority getDecodeRetentionPriority() const
-
std::optional<std::chrono::milliseconds> getDecodeDurationMs() const
-
std::vector<RetentionPriorityAndDuration> getPerBlockRetentionPriorityDuration(SizeType32 blockSize, SizeType32 seqLen) const
Convert the token range data into an entry per kv block. Returns a vector with one RetentionPriorityAndDuration entry per block, holding the retention priority and duration for that block.
Public Static Attributes
-
static constexpr RetentionPriority kMinRetentionPriority = 0
-
static constexpr RetentionPriority kMaxRetentionPriority = 100
-
static constexpr RetentionPriority kDefaultRetentionPriority = 35
Private Members
-
std::vector<TokenRangeRetentionConfig> mTokenRangeRetentionConfigs
The token ranges and priority levels to update. Ranges must be non-overlapping. For example [(0, 64), (100, 128), (70, 80)] is valid, whereas [(0, 64), (60, 128)] is not.
-
RetentionPriority mDecodeRetentionPriority
The priority level to assign to blocks allocated in the decode phase.
-
std::optional<std::chrono::milliseconds> mDecodeDurationMs
The duration in ms that decode blocks should remain at their assigned priority level.
-
struct TokenRangeRetentionConfig
- #include <executor.h>
A single entry to set block priorities over a token range. Earlier ranges always take priority over later ones. For example, with a block size of 16, a range of [0, 17] would be applied to the first two blocks.
Public Functions
-
inline explicit TokenRangeRetentionConfig(SizeType32 tokenStart, std::optional<SizeType32> tokenEnd = std::nullopt, RetentionPriority priority = KvCacheRetentionConfig::kDefaultRetentionPriority, std::optional<std::chrono::milliseconds> durationMs = std::nullopt)
-
inline bool operator==(TokenRangeRetentionConfig const &other) const
Public Members
-
SizeType32 tokenStart
The first token of this range.
-
std::optional<SizeType32> tokenEnd
The final token of this range. The end is not included in the range. This can be set to std::nullopt to extend the range to the end of the sequence.
-
RetentionPriority priority
The priority of this token range. Higher priorities are less likely to be evicted or offloaded.
-
std::optional<std::chrono::milliseconds> durationMs
The duration in ms that the block should remain at the given priority level. Set to std::nullopt to have no expiration time, and keep the block at the given priority level until it gets reclaimed. After the duration has passed, the block will be moved back to the `kDefaultRetentionPriority` level.
-
inline explicit TokenRangeRetentionConfig(SizeType32 tokenStart, std::optional<SizeType32> tokenEnd = std::nullopt, RetentionPriority priority = KvCacheRetentionConfig::kDefaultRetentionPriority, std::optional<std::chrono::milliseconds> durationMs = std::nullopt)
-
inline explicit KvCacheRetentionConfig()
-
class Request
- #include <executor.h>
A class that holds information about the request.
Public Functions
-
Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false, SamplingConfig const &samplingConfig = SamplingConfig(), OutputConfig const &outputConfig = OutputConfig(), std::optional<SizeType32> const &endId = std::nullopt, std::optional<SizeType32> const &padId = std::nullopt, std::optional<std::vector<SizeType32>> positionIds = std::nullopt, std::optional<std::list<VecTokens>> badWords = std::nullopt, std::optional<std::list<VecTokens>> stopWords = std::nullopt, std::optional<Tensor> embeddingBias = std::nullopt, std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig = std::nullopt, std::optional<PromptTuningConfig> pTuningConfig = std::nullopt, std::optional<MropeConfig> mRopeConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt, std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt, std::optional<std::string> logitsPostProcessorName = std::nullopt, std::optional<VecTokens> encoderInputTokenIds = std::nullopt, std::optional<IdType> clientId = std::nullopt, bool returnAllGeneratedTokens = false, PriorityType priority = kDefaultPriority, RequestType type = RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, std::optional<ContextPhaseParams> contextPhaseParams = std::nullopt, std::optional<Tensor> encoderInputFeatures = std::nullopt, std::optional<SizeType32> encoderOutputLength = std::nullopt, std::optional<Tensor> crossAttentionMask = std::nullopt, SizeType32 numReturnSequences = 1, std::optional<EagleConfig> eagleConfig = std::nullopt, std::optional<Tensor> skipCrossAttnBlocks = std::nullopt, std::optional<GuidedDecodingParams> guidedDecodingParams = std::nullopt, std::optional<MillisecondsType> allottedTimeMs = std::nullopt)
The Request constructor.
- Parameters:
inputTokenIds – The input token ids
maxTokens – The maximum number of tokens to generate
streaming – Indicates if the responses should be streamed or not. Default is false.
samplingConfig – The sampling config
-
Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false, SamplingConfig const &samplingConfig = SamplingConfig(), OutputConfig const &outputConfig = OutputConfig(), std::optional<SizeType32> const &endId = std::nullopt, std::optional<SizeType32> const &padId = std::nullopt, std::optional<std::vector<SizeType32>> positionIds = std::nullopt, std::optional<std::list<VecTokens>> badWords = std::nullopt, std::optional<std::list<VecTokens>> stopWords = std::nullopt, std::optional<Tensor> embeddingBias = std::nullopt, std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig = std::nullopt, std::optional<PromptTuningConfig> pTuningConfig = std::nullopt, std::optional<MropeConfig> mRopeConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt, std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt, std::optional<std::string> logitsPostProcessorName = std::nullopt, std::optional<VecTokens> encoderInputTokenIds = std::nullopt, std::optional<IdType> clientId = std::nullopt, bool returnAllGeneratedTokens = false, PriorityType priority = kDefaultPriority, RequestType type = RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, std::optional<ContextPhaseParams> contextPhaseParams = std::nullopt, std::optional<Tensor> encoderInputFeatures = std::nullopt, std::optional<SizeType32> encoderOutputLength = std::nullopt, std::optional<Tensor> crossAttentionMask = std::nullopt, SizeType32 numReturnSequences = 1, std::optional<EagleConfig> eagleConfig = std::nullopt, std::optional<Tensor> skipCrossAttnBlocks = std::nullopt, std::optional<GuidedDecodingParams> guidedDecodingParams = std::nullopt, std::optional<MillisecondsType> allottedTimeMs = std::nullopt)
-
using RetentionPriority = SizeType32
-
namespace executor