Executor#
disaggServerUtil.h#
-
namespace tensorrt_llm#
-
namespace executor#
-
namespace disagg_executor#
-
class DisaggExecutorOrchestrator#
Public Functions
- DisaggExecutorOrchestrator(
- std::vector<std::filesystem::path> const &ctxEnginePaths,
- std::vector<std::filesystem::path> const &genEnginePaths,
- std::vector<executor::ExecutorConfig> const &ctxExecutorConfigs,
- std::vector<executor::ExecutorConfig> const &genExecutorConfigs,
- bool hasContextAwaitThreads,
- bool hasGenAwaitThreads,
Constructs a DisaggExecutorOrchestrator object.
- Parameters:
ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each context executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
- std::vector<IdType> enqueueContext(
- std::vector<texec::Request> const &requests,
- std::optional<int> selectContextId = std::nullopt,
- bool batch = false,
Enqueue context-only requests to context executors.
- Parameters:
requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If
std::nullopt
, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue requests in the same context executor. If false, will try to use a different executor for each request.
- Returns:
A vector of global request ids, corresponding to the order of the requests in
requests
, the id returned may be different from the request id in each executor.
- void enqueueGeneration(
- std::vector<texec::Request> const &requests,
- std::vector<IdType> const &globalRequestIds,
- std::optional<int> selectGenIdx = std::nullopt,
- bool batch = false,
Enqueue generation-only requests to generation executors.
- Parameters:
requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests, and must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If
std::nullopt
, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue requests in the same generation executor. If false, will try to use a different executor for each request.
- std::vector<ResponseWithId> awaitContextResponses(
- std::optional<std::chrono::milliseconds> const &timeout,
- std::optional<int> contextIdx = std::nullopt,
Await for context responses.
- Parameters:
timeout – The maximum time to wait for new responses
contextIdx – The index of the context executor to use. If
std::nullopt
, return ready responses from all context executors. If hasContextAwaitThreads
is true, then this parameter must be std::nullopt.
- Returns:
A vector of responses with corresponding global request ids
- std::vector<ResponseWithId> awaitGenerationResponses(
- std::optional<std::chrono::milliseconds> const &timeout,
- std::optional<int> genIdx = std::nullopt,
Await for generation responses.
- Parameters:
timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If
std::nullopt
, return ready responses from all generation executors. If hasGenAwaitThreads
is true, then this parameter must be std::nullopt.
- Returns:
A vector of responses with corresponding global request ids.
-
bool canEnqueue() const#
Indicates if the current process is allowed to enqueueRequests.
- std::vector<std::unique_ptr<texec::Executor>> const &getContextExecutors(
Get context executors.
- std::vector<std::unique_ptr<texec::Executor>> const &getGenExecutors(
Get generation executors.
-
~DisaggExecutorOrchestrator()#
Private Members
-
std::unique_ptr<Impl> mImpl#
-
struct ResponseWithId#
Public Functions
- inline ResponseWithId(
- tensorrt_llm::executor::Response &&response,
- IdType gid,
- inline ResponseWithId(
- tensorrt_llm::executor::Response const &response,
- IdType gid,
-
inline ResponseWithId(ResponseWithId &&other) noexcept#
-
ResponseWithId(ResponseWithId const &other) = default#
-
inline ResponseWithId &operator=(ResponseWithId &&other) noexcept#
-
inline ResponseWithId &operator=(ResponseWithId const &other)#
-
~ResponseWithId() = default#
-
class DisaggExecutorOrchestrator#
-
namespace disagg_executor#
-
namespace executor#
executor.h#
-
namespace tensorrt_llm
-
-
namespace executor
Typedefs
-
using RetentionPriority = SizeType32#
-
using KVCacheEventData = std::variant<KVCacheCreatedData, KVCacheStoredData, KVCacheRemovedData, KVCacheUpdatedData>#
Functions
-
char const *version() noexcept#
Version of TRT-LLM.
-
struct AdditionalOutput#
-
class ContextPhaseParams#
Public Types
-
using RequestIdType = std::uint64_t#
Public Functions
- explicit ContextPhaseParams(
- VecTokens firstGenTokens,
- RequestIdType reqId,
- ContextPhaseParams(
- VecTokens firstGenTokens,
- RequestIdType reqId,
- void *state,
-
ContextPhaseParams(ContextPhaseParams const&)#
-
ContextPhaseParams(ContextPhaseParams&&) noexcept#
-
ContextPhaseParams &operator=(ContextPhaseParams const&)#
-
ContextPhaseParams &operator=(ContextPhaseParams&&) noexcept#
-
~ContextPhaseParams()#
-
bool operator==(ContextPhaseParams const&) const noexcept#
-
RequestIdType getReqId() const noexcept#
-
void const *getState() const noexcept#
-
void *getState() noexcept#
-
void *releaseState() noexcept#
Private Members
-
RequestIdType mReqId = {0}#
This request corresponds to the request ID in the context phase.
Private Static Functions
-
static void deleter(void const *data)#
Friends
- friend class Serialization
-
using RequestIdType = std::uint64_t#
-
class DebugConfig#
- #include <executor.h>
Configuration class for debugging output.
Public Functions
- explicit DebugConfig(
- bool debugInputTensors = false,
- bool debugOutputTensors = false,
- StringVec debugTensorNames = {},
- SizeType32 debugTensorsMaxIterations = 0,
-
bool operator==(DebugConfig const &other) const#
-
bool getDebugInputTensors() const#
-
bool getDebugOutputTensors() const#
-
SizeType32 getDebugTensorsMaxIterations() const#
-
void setDebugInputTensors(bool debugInputTensors)#
-
void setDebugOutputTensors(bool debugOutputTensors)#
- void setDebugTensorsMaxIterations(
- SizeType32 debugTensorsMaxIterations,
Private Types
-
using StringVec = std::vector<std::string>#
Private Members
-
bool mDebugInputTensors#
If true, debug all input tensors.
-
bool mDebugOutputTensors#
If true, debug all output tensors.
-
SizeType32 mDebugTensorsMaxIterations#
If > 0, provide debug tensors for at most debugTensorsMaxIterations past iterations, else dump them to files.
Friends
- friend class Serialization
-
class DecodingConfig#
- #include <executor.h>
Configuration class for the decoding.
Public Functions
- explicit DecodingConfig(
- std::optional<DecodingMode> decodingMode = std::nullopt,
- std::optional<LookaheadDecodingConfig> lookaheadDecodingConfig = std::nullopt,
- std::optional<MedusaChoices> medusaChoices = std::nullopt,
- std::optional<EagleConfig> eagleConfig = std::nullopt,
-
bool operator==(DecodingConfig const &other) const#
-
void setDecodingMode(DecodingMode const&)#
Sets decoding mode. Some modes require the use of their own setters.
-
std::optional<DecodingMode> getDecodingMode() const#
- void setLookaheadDecoding(
- LookaheadDecodingConfig const &lookaheadDecodingConfig,
Sets lookahead decoding mode and config.
-
void enableSeamlessLookaheadDecoding()#
- std::optional<LookaheadDecodingConfig> getLookaheadDecodingConfig(
-
SizeType32 getLookaheadDecodingMaxNumRequest() const#
-
void setMedusaChoices(MedusaChoices const&)#
Sets medusa mode and config.
-
std::optional<MedusaChoices> getMedusaChoices() const#
-
void setEagleConfig(EagleConfig const&)#
Sets eagle mode and config.
-
std::optional<EagleConfig> getEagleConfig() const#
Private Members
-
std::optional<DecodingMode> mDecodingMode#
-
std::optional<LookaheadDecodingConfig> mLookaheadDecodingConfig#
-
std::optional<MedusaChoices> mMedusaChoices#
-
std::optional<EagleConfig> mEagleConfig#
Private Static Attributes
-
static constexpr SizeType32 mLookaheadDecodingMaxNumRequest = 8#
Friends
- friend class Serialization
-
class DynamicBatchConfig#
- #include <executor.h>
Configuration class for dynamic tuning of batch size and max num tokens. During runtime the statistics of input and output lengths are recorded. Based on these statistics, the batch size and max num tokens are tuned dynamically to better serve the requests.
Public Functions
- explicit DynamicBatchConfig(
- bool enableBatchSizeTuning = false,
- bool enableMaxNumTokensTuning = false,
- SizeType32 dynamicBatchMovingAverageWindow = kDefaultDynamicBatchMovingAverageWindow,
- std::vector<std::pair<SizeType32, SizeType32>> batchSizeTable = kDefaultBatchSizeTable,
-
SizeType32 getDynamicBatchMovingAverageWindow() const#
-
bool getEnableBatchSizeTuning() const#
-
bool getEnableMaxNumTokensTuning() const#
- std::vector<std::pair<SizeType32, SizeType32>> getBatchSizeTable(
Public Static Attributes
-
static SizeType32 const kDefaultDynamicBatchMovingAverageWindow = 128#
The default window size for moving average of input and output length which is used to calculate dynamic batch size and max num tokens.
-
static std::vector<std::pair<SizeType32, SizeType32>> const kDefaultBatchSizeTable#
The default value of batch size table.
Private Members
-
bool mEnableBatchSizeTuning#
Controls if the batch size should be tuned dynamically.
-
bool mEnableMaxNumTokensTuning#
Controls if the max num tokens should be tuned dynamically.
-
SizeType32 mDynamicBatchMovingAverageWindow#
The window size for moving average of input and output length which is used to calculate dynamic batch size and max num tokens.
-
std::vector<std::pair<SizeType32, SizeType32>> mBatchSizeTable#
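A vector of (batchSizeLimit, batchSize) pairs. When the max capacity batch size is less than batchSizeLimit_i but greater than or equal to batchSizeLimit_{i-1}, the batch size will be batchSize_i.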
Friends
- friend class Serialization
-
struct EagleConfig#
Public Functions
- explicit EagleConfig(
- std::optional<EagleChoices> eagleChoices = std::nullopt,
- bool greedySampling = true,
- std::optional<float> posteriorThreshold = std::nullopt,
-
bool operator==(EagleConfig const &other) const#
-
std::optional<EagleChoices> getEagleChoices() const#
-
std::optional<float> getPosteriorThreshold() const#
-
bool isGreedySampling() const#
Private Functions
- std::optional<float> const &checkPosteriorValue(
- std::optional<float> const &value,
Private Members
-
std::optional<EagleChoices> mEagleChoices#
choices forming tree for EAGLE-1.
-
bool mGreedySampling#
Flag to use greedy or typical acceptance.
-
std::optional<float> mPosteriorThreshold#
Minimum token probability of the typical acceptance. Corresponds to epsilon in https://arxiv.org/pdf/2401.10774. Default is 0.09f.
Friends
- friend class Serialization
-
class Executor#
- #include <executor.h>
The executor is responsible for receiving new requests and sending responses, and running the inference.
Public Functions
- Executor(
- std::filesystem::path const &modelPath,
- ModelType modelType,
- ExecutorConfig const &executorConfig,
- Parameters:
modelPath – Path to the folder that defines the model to run
modelType – The type of model
executorConfig – The configuration for the executor
- Executor(
- std::filesystem::path const &encoderModelPath,
- std::filesystem::path const &decoderModelPath,
- ModelType modelType,
- ExecutorConfig const &executorConfig,
- Executor(
- BufferView const &engineBuffer,
- std::string const &jsonConfigStr,
- ModelType modelType,
- ExecutorConfig const &executorConfig,
- std::optional<std::map<std::string, Tensor>> const &managedWeights = std::nullopt,
- Executor(
- BufferView const &encoderEngineBuffer,
- std::string const &encoderJsonConfigStr,
- BufferView const &decoderEngineBuffer,
- std::string const &decoderJsonConfigStr,
- ModelType modelType,
- ExecutorConfig const &executorConfig,
- std::shared_ptr<Model> model,
- ExecutorConfig const &executorConfig,
- std::shared_ptr<Model> encoderModel,
- std::shared_ptr<Model> decoderModel,
- ExecutorConfig const &executorConfig,
-
~Executor()#
-
IdType enqueueRequest(Request const &request)#
Enqueue a new request.
- Parameters:
request – The LLM request which contains input tokens and request parameters
- Returns:
A unique id that identifies the request
- std::vector<IdType> enqueueRequests(
- std::vector<Request> const &requests,
Enqueue a batch of requests.
- std::vector<Response> awaitResponses(
- std::optional<std::chrono::milliseconds> const &timeout = std::nullopt,
Await for ready responses.
This overload awaits for any ready responses. In particular, if several requests have been enqueued, this method will provide any ready responses without order guarantees.
- Parameters:
timeout – The maximum time to wait for new responses
- Returns:
A vector of responses
- std::vector<Response> awaitResponses(
- IdType const &requestId,
- std::optional<std::chrono::milliseconds> const &timeout = std::nullopt,
Await for ready responses.
- Parameters:
requestId – A request id
timeout – The maximum time to wait for new responses
- Returns:
A vector of responses
- std::vector<std::vector<Response>> awaitResponses(
- std::vector<IdType> const &requestIds,
- std::optional<std::chrono::milliseconds> const &timeout = std::nullopt,
Await for multiple ready responses.
A multiple ID request behaves as if awaitResponses(IdType, timeout) were invoked on all IDs. The returned vector contains a vector of responses per ID in the same order specified by the requestIds. The same behaviour as awaitResponses(IdType, timeout) applies: responses may be empty, and if all responses have already been given for one of the requestIds, then this method will hang unless a timeout is specified.
- Parameters:
requestIds – Ids requested
timeout – The maximum time to wait for new responses
- Returns:
A vector of vector of responses
- SizeType32 getNumResponsesReady(
- std::optional<IdType> const &requestId = std::nullopt,
Get the number of ready responses.
- Parameters:
requestId – An optional request id
- Returns:
The number of ready responses
-
void cancelRequest(IdType requestId)#
Cancel the request with provided request id.
- Parameters:
requestId – The request id for which to cancel the response
-
void shutdown()#
Signals the server to shutdown.
This call is blocking. It only returns when all requests have terminated or the timeout has been reached.
-
std::deque<IterationStats> getLatestIterationStats()#
Returns the per-iterations statistics computed since last call to getLatestIterationStats. Contains at most iterStatsMaxIterations iterations.
- Returns:
Iteration stats
-
std::deque<RequestStatsPerIteration> getLatestRequestStats()#
Returns the request stats of each iteration computed since last call to getLatestRequestStats. Contains at most requestStatsMaxIterations iterations.
- Returns:
Request stats grouped by iterations
-
std::deque<DebugTensorsPerIteration> getLatestDebugTensors()#
Returns the debug tensors of each iteration computed since last call to getLatestDebugTensors. Contains at most debugTensorsMaxIterations iterations.
- Returns:
Request debug tensors grouped by iterations
-
bool canEnqueueRequests() const#
Indicates if the current process is allowed to enqueueRequests.
-
bool isParticipant() const#
Indicates if the current process participates in this executor instance.
- std::optional<std::shared_ptr<KVCacheEventManager>> getKVCacheEventManager(
Private Members
-
std::unique_ptr<Impl> mImpl#
-
class ExecutorConfig#
- #include <executor.h>
Configuration class for the model executor.
Public Functions
- explicit ExecutorConfig(
- SizeType32 maxBeamWidth = 1,
- SchedulerConfig schedulerConfig = SchedulerConfig(),
- KvCacheConfig kvCacheConfig = KvCacheConfig(),
- bool enableChunkedContext = true,
- bool normalizeLogProbs = true,
- SizeType32 iterStatsMaxIterations = kDefaultIterStatsMaxIterations,
- SizeType32 requestStatsMaxIterations = kDefaultRequestStatsMaxIterations,
- BatchingType batchingType = BatchingType::kINFLIGHT,
- std::optional<SizeType32> maxBatchSize = std::nullopt,
- std::optional<SizeType32> maxNumTokens = std::nullopt,
- std::optional<ParallelConfig> parallelConfig = std::nullopt,
- std::optional<PeftCacheConfig> const &peftCacheConfig = std::nullopt,
- std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig = std::nullopt,
- std::optional<DecodingConfig> decodingConfig = std::nullopt,
- float gpuWeightsPercent = 1,
- std::optional<SizeType32> maxQueueSize = std::nullopt,
- ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(),
- std::optional<DebugConfig> debugConfig = std::nullopt,
- SizeType32 recvPollPeriodMs = 0,
- uint64_t maxSeqIdleMicroseconds = kDefaultMaxSeqIdleMicroseconds,
- std::optional<SpeculativeDecodingConfig> specDecConfig = std::nullopt,
- std::optional<GuidedDecodingConfig> guidedDecodingConfig = std::nullopt,
- std::optional<std::vector<std::string>> additionalOutputNames = std::nullopt,
-
SizeType32 getMaxBeamWidth() const#
-
SchedulerConfig getSchedulerConfig() const#
-
KvCacheConfig getKvCacheConfig() const#
-
SchedulerConfig &getSchedulerConfigRef()#
-
KvCacheConfig &getKvCacheConfigRef()#
-
bool getEnableChunkedContext() const#
-
bool getNormalizeLogProbs() const#
-
SizeType32 getIterStatsMaxIterations() const#
-
SizeType32 getRequestStatsMaxIterations() const#
-
BatchingType getBatchingType() const#
-
std::optional<SizeType32> getMaxBatchSize() const#
-
std::optional<SizeType32> getMaxNumTokens() const#
-
std::optional<ParallelConfig> getParallelConfig() const#
-
std::optional<PeftCacheConfig> getPeftCacheConfig() const#
- std::optional<LogitsPostProcessorConfig> getLogitsPostProcessorConfig(
-
std::optional<DecodingConfig> getDecodingConfig() const#
-
float getGpuWeightsPercent() const#
-
std::optional<SizeType32> getMaxQueueSize() const#
- ExtendedRuntimePerfKnobConfig getExtendedRuntimePerfKnobConfig(
-
std::optional<DebugConfig> getDebugConfig() const#
-
SizeType32 getRecvPollPeriodMs() const#
-
uint64_t getMaxSeqIdleMicroseconds() const#
-
std::optional<SpeculativeDecodingConfig> getSpecDecConfig() const#
-
std::optional<GuidedDecodingConfig> getGuidedDecodingConfig() const#
- std::optional<std::vector<std::string>> getAdditionalOutputNames(
-
void setMaxBeamWidth(SizeType32 maxBeamWidth)#
-
void setMaxBatchSize(SizeType32 maxBatchSize)#
-
void setMaxNumTokens(SizeType32 maxNumTokens)#
-
void setSchedulerConfig(SchedulerConfig const &schedulerConfig)#
-
void setKvCacheConfig(KvCacheConfig const &kvCacheConfig)#
-
void setEnableChunkedContext(bool enableChunkedContext)#
-
void setNormalizeLogProbs(bool normalizeLogProbs)#
-
void setIterStatsMaxIterations(SizeType32 iterStatsMaxIterations)#
- void setRequestStatsMaxIterations(
- SizeType32 requestStatsMaxIterations,
-
void setBatchingType(BatchingType batchingType)#
-
void setParallelConfig(ParallelConfig const &parallelConfig)#
-
void setPeftCacheConfig(PeftCacheConfig const &peftCacheConfig)#
- void setLogitsPostProcessorConfig(
- LogitsPostProcessorConfig const &logitsPostProcessorConfig,
-
void setDecodingConfig(DecodingConfig const &decodingConfig)#
-
void setGpuWeightsPercent(float const &gpuWeightsPercent)#
-
void setMaxQueueSize(std::optional<SizeType32> const &maxQueueSize)#
- void setExtendedRuntimePerfKnobConfig(
- ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig,
-
void setDebugConfig(DebugConfig const &debugConfig)#
-
void setRecvPollPeriodMs(SizeType32 const &recvPollPeriodMs)#
-
void setMaxSeqIdleMicroseconds(uint64_t maxNumTokens)#
-
void setSpecDecConfig(SpeculativeDecodingConfig const &specDecConfig)#
- void setGuidedDecodingConfig(
- GuidedDecodingConfig const &guidedDecodingConfig,
- void setAdditionalOutputNames(
- std::vector<std::string> const &additionalOutputNames,
Public Static Attributes
-
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000#
-
static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000#
-
static constexpr SizeType32 kDefaultRequestStatsMaxIterations = 0#
Private Members
-
SizeType32 mMaxBeamWidth#
The beam width value of requests that will be sent to the executor.
-
SchedulerConfig mSchedulerConfig#
The scheduler configuration.
-
KvCacheConfig mKvCacheConfig#
The KV cache configuration.
-
bool mEnableChunkedContext#
Controls whether context is allowed to be chunked.
-
bool mNormalizeLogProbs#
Controls if log probabilities should be normalized or not.
-
SizeType32 mIterStatsMaxIterations#
Controls the maximum number of iterations for which to keep statistics.
-
SizeType32 mRequestStatsMaxIterations#
Controls the maximum number of iterations for which to keep per-request statistics.
-
BatchingType mBatchingType#
The type of batching strategy to use. See BatchingType.
-
std::optional<SizeType32> mMaxBatchSize#
The max batch size of requests.
-
std::optional<SizeType32> mMaxNumTokens#
The max number of tokens per batch.
-
std::optional<ParallelConfig> mParallelConfig#
The parallel execution configuration.
-
std::optional<PeftCacheConfig> mPeftCacheConfig#
-
std::optional<LogitsPostProcessorConfig> mLogitsPostProcessorConfig#
Logits post processor configuration.
-
std::optional<DecodingConfig> mDecodingConfig#
Decoding configuration.
-
float mGpuWeightsPercent#
GPU weights percent for weight streaming.
-
std::optional<SizeType32> mMaxQueueSize#
The maximum number of requests allowed in queue before rejecting new requests.
-
ExtendedRuntimePerfKnobConfig mExtendedRuntimePerfKnobConfig#
Config for perf knobs that can be set in runtime.
-
std::optional<DebugConfig> mDebugConfig#
Debugging configuration.
-
SizeType32 mRecvPollPeriodMs#
The time in ms between polls for new communication in orchestrator mode. Use 0 for busy loop.
-
uint64_t mMaxSeqIdleMicroseconds#
The maximum time in microseconds a scheduled request can remain idle before getting terminated. Default is 3 minutes.
-
std::optional<SpeculativeDecodingConfig> mSpeculativeDecodingConfig#
The speculative decoding configuration.
-
std::optional<GuidedDecodingConfig> mGuidedDecodingConfig#
The guided decoding configuration.
-
std::optional<std::vector<std::string>> mAdditionalOutputNames#
The additional output tensor names.
Friends
- friend class Serialization
-
class ExtendedRuntimePerfKnobConfig#
- #include <executor.h>
Configuration class for the runtime perf knobs.
Public Functions
- explicit ExtendedRuntimePerfKnobConfig(
- bool multiBlockMode = true,
- bool enableContextFMHAFP32Acc = false,
- bool cudaGraphMode = false,
- SizeType32 cudaGraphCacheSize = 0,
- inline bool operator==(
- ExtendedRuntimePerfKnobConfig const &other,
-
bool getMultiBlockMode() const#
-
bool getEnableContextFMHAFP32Acc() const#
-
bool getCudaGraphMode() const#
-
SizeType32 getCudaGraphCacheSize() const#
-
void setMultiBlockMode(bool multiBlockMode)#
-
void setEnableContextFMHAFP32Acc(bool enableContextFMHAFP32Acc)#
-
void setCudaGraphMode(bool cudaGraphMode)#
-
void setCudaGraphCacheSize(SizeType32 cacheSize)#
Private Members
-
bool mMultiBlockMode#
Control if multi block mode should be enabled or not.
-
bool mEnableContextFMHAFP32Acc#
Controls whether FP32 accumulation is enabled for the context FMHA runner.
-
bool mCudaGraphMode#
Controls whether CUDA graph mode is enabled.
-
SizeType32 mCudaGraphCacheSize#
Number of cuda graphs to be cached in the runtime. The larger the cache, the better the perf, but more GPU memory is consumed.
Friends
- friend class Serialization
-
class ExternalDraftTokensConfig#
- #include <executor.h>
Configuration for speculative decoding with external draft tokens. Allows including draft tokens and draft logits, and specifying an acceptance threshold.
Public Functions
- explicit ExternalDraftTokensConfig(
- VecTokens tokens,
- std::optional<Tensor> logits = std::nullopt,
- std::optional<FloatType> const &acceptanceThreshold = std::nullopt,
- std::optional<bool> const &fastLogits = std::nullopt,
-
std::optional<bool> getFastLogits() const#
Private Members
-
std::optional<bool> mFastLogits#
Use direct transfer for draft logits.
Friends
- friend class Serialization
-
class GuidedDecodingConfig#
- #include <executor.h>
Guided decoding configurations for executor.
Public Types
Public Functions
- explicit GuidedDecodingConfig(
- GuidedDecodingBackend backend,
- std::optional<std::vector<std::string>> encodedVocab = std::nullopt,
- std::optional<std::string> tokenizerStr = std::nullopt,
- std::optional<std::vector<TokenIdType>> stopTokenIds = std::nullopt,
-
bool operator==(GuidedDecodingConfig const &other) const#
-
void setBackend(GuidedDecodingBackend const &backend)#
-
GuidedDecodingBackend getBackend() const#
-
void setEncodedVocab(std::vector<std::string> const &encodedVocab)#
-
std::optional<std::vector<std::string>> getEncodedVocab() const#
-
void setTokenizerStr(std::string const &tokenizerStr)#
-
std::optional<std::string> getTokenizerStr() const#
-
void setStopTokenIds(std::vector<TokenIdType> const &stopTokenIds)#
-
std::optional<std::vector<TokenIdType>> getStopTokenIds() const#
-
void validate() const#
Private Members
-
GuidedDecodingBackend mBackend#
Guided decoding backend. Currently supports XGrammar.
-
std::optional<std::vector<std::string>> mEncodedVocab#
Encoded vocabulary. For a huggingface tokenizer, it can be extracted by:
encoded_vocab = tokenizer.get_vocab() encoded_vocab = [token for token, _ in sorted(encoded_vocab.items(), key=lambda x: x[1])]
-
std::optional<std::string> mTokenizerStr#
Tokenizer string. For a huggingface fast tokenizer, it can be extracted by:
tokenizer_str = tokenizer.backend_tokenizer.to_str()
-
std::optional<std::vector<TokenIdType>> mStopTokenIds#
Stop token ids. If not provided, it can be automatically detected.
Friends
- friend class Serialization
-
class GuidedDecodingParams#
- #include <executor.h>
Guided decoding parameters for a request.
Public Types
-
enum class GuideType#
Values:
-
enumerator kJSON#
The generated text is amenable to json format.
-
enumerator kJSON_SCHEMA#
The generated text is amenable to json format with additional user-specified restrictions, namely schema.
-
enumerator kREGEX#
The generated text is amenable to the user-specified regular expression.
-
enumerator kEBNF_GRAMMAR#
The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. EBNF grammar is widely-used to express context-free grammars.
-
enumerator kJSON#
Public Functions
- explicit GuidedDecodingParams(
- GuideType guideType,
- std::optional<std::string> guide = std::nullopt,
-
bool operator==(GuidedDecodingParams const &other) const#
-
std::optional<std::string> getGuide() const#
Private Members
-
std::optional<std::string> mGuide#
The detailed guide string. It could be a json schema, a regular expression or an EBNF grammar depending on mGuideType.
Friends
- friend class Serialization
-
enum class GuideType#
-
class JsonSerialization#
- #include <executor.h>
Class with utility functions to serialize statistics to json string.
Public Static Functions
-
static std::string toJsonStr(IterationStats const &iterationStats)#
Utility function to convert an iterationStats struct to a json serialized string.
- static std::string toJsonStr(
- RequestStatsPerIteration const &requestStatsPerIter,
Utility function to convert a requestStatsPerIteration struct to a json serialized string.
-
static std::string toJsonStr(RequestStats const &requestStats)#
Utility function to convert a requestStats struct to a json serialized string.
-
static std::string toJsonStr(IterationStats const &iterationStats)#
-
class KvCacheConfig#
- #include <executor.h>
Configuration class for the KV cache.
Public Functions
- explicit KvCacheConfig(
- bool enableBlockReuse = true,
- std::optional<SizeType32> const &maxTokens = std::nullopt,
- std::optional<std::vector<SizeType32>> const &maxAttentionWindowVec = std::nullopt,
- std::optional<SizeType32> const &sinkTokenLength = std::nullopt,
- std::optional<FloatType> const &freeGpuMemoryFraction = std::nullopt,
- std::optional<size_t> const &hostCacheSize = std::nullopt,
- bool onboardBlocks = true,
- std::optional<FloatType> const &crossKvCacheFraction = std::nullopt,
- std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
- size_t eventBufferMaxSize = 0,
- std::optional<tensorrt_llm::runtime::RuntimeDefaults> const &runtimeDefaults = std::nullopt,
-
bool getEnableBlockReuse() const#
-
std::optional<SizeType32> getMaxTokens() const#
- std::optional<std::vector<SizeType32>> getMaxAttentionWindowVec(
-
std::optional<SizeType32> getSinkTokenLength() const#
-
std::optional<size_t> getHostCacheSize() const#
-
bool getOnboardBlocks() const#
- std::optional<RetentionPriority> getSecondaryOffloadMinPriority(
-
size_t getEventBufferMaxSize() const#
-
void setEnableBlockReuse(bool enableBlockReuse)#
-
void setMaxTokens(SizeType32 maxTokens)#
- void setMaxAttentionWindowVec(
- std::vector<SizeType32> maxAttentionWindowVec,
-
void setSinkTokenLength(SizeType32 sinkTokenLength)#
-
void setHostCacheSize(size_t hostCacheSize)#
-
void setOnboardBlocks(bool onboardBlocks)#
- void setSecondaryOffloadMinPriority(
- std::optional<RetentionPriority> secondaryOffloadMinPriority,
-
void setEventBufferMaxSize(size_t eventBufferMaxSize)#
- void fillEmptyFieldsFromRuntimeDefaults(
- tensorrt_llm::runtime::RuntimeDefaults runtimeDefaults,
Private Members
-
bool mEnableBlockReuse#
Controls if KV cache blocks can be reused for different requests.
-
std::optional<SizeType32> mMaxTokens#
The maximum number of tokens that should be stored in the KV cache. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.
-
std::optional<std::vector<SizeType32>> mMaxAttentionWindowVec#
Size of the attention window for each sequence. Only the last mMaxAttentionWindow tokens of each sequence will be stored in the KV cache. Different layers may have different max attention window sizes. If the number of elements in mMaxAttentionWindowVec is less than the number of layers, mMaxAttentionWindowVec will be repeated multiple times to the number of layers.
-
std::optional<SizeType32> mSinkTokenLength#
Number of sink tokens (tokens to always keep in attention window)
-
std::optional<FloatType> mFreeGpuMemoryFraction#
The fraction of free GPU memory that should be allocated for the KV cache. Default is 90%. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.
-
std::optional<FloatType> mCrossKvCacheFraction#
The fraction of the KV Cache memory that should be reserved for cross attention. If set to p, self attention will use 1-p of KV Cache memory and cross attention will use p of KV Cache memory. Default is 50%. Should only be set when using encoder-decoder model.
-
std::optional<size_t> mHostCacheSize#
Size of secondary memory pool in bytes. Default is 0. Having a secondary memory pool increases KV cache block reuse potential.
-
bool mOnboardBlocks#
Controls whether offloaded blocks should be onboarded back into primary memory before being reused.
-
std::optional<RetentionPriority> mSecondaryOffloadMinPriority#
Only blocks with priority > mSecondaryOffloadMinPriority can be offloaded to secondary memory.
-
size_t mEventBufferMaxSize#
Max size of the KV cache event buffer.
Friends
- friend class Serialization
-
struct KVCacheCreatedData#
Public Members
-
std::vector<SizeType32> numBlocksPerCacheLevel#
The amount of blocks at each cache level.
-
std::vector<SizeType32> numBlocksPerCacheLevel#
-
struct KVCacheEvent#
Public Functions
-
KVCacheEvent(IdType eventId, KVCacheEventData data)#
Public Members
-
KVCacheEventData data#
The data corresponding to this event.
-
KVCacheEvent(IdType eventId, KVCacheEventData data)#
-
template<typename T>
struct KVCacheEventDiff#
-
class KVCacheEventManager#
- #include <executor.h>
Exposes a limited set of KV cache manager functionalities.
Public Functions
- std::shared_ptr<tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager> kvCacheManager,
- std::deque<KVCacheEvent> getLatestEvents(
- std::optional<std::chrono::milliseconds> timeout = std::nullopt,
Get the latest KV Cache events.
- Parameters:
timeout – The maximum time to wait for new events. If nullopt, will only return when new events are available, or when the executor instance has shut down.
Private Members
-
std::shared_ptr<tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager> kvCacheManager#
-
struct KVCacheRemovedData#
-
class KvCacheRetentionConfig#
- #include <executor.h>
Configuration for the request’s retention in the KV Cache.
Public Functions
-
inline explicit KvCacheRetentionConfig()#
- explicit KvCacheRetentionConfig(
- std::vector<TokenRangeRetentionConfig> const &tokenRangeRetentionPriorities,
- RetentionPriority decodeRetentionPriority = kDefaultRetentionPriority,
- std::optional<std::chrono::milliseconds> decodeDurationMs = std::nullopt,
- std::vector<TokenRangeRetentionConfig> getTokenRangeRetentionConfigs(
-
RetentionPriority getDecodeRetentionPriority() const#
- std::optional<std::chrono::milliseconds> getDecodeDurationMs(
- std::vector<RetentionPriorityAndDuration> getPerBlockRetentionPriorityDuration(
- SizeType32 blockSize,
- SizeType32 seqLen,
Convert the token range data into an entry per kv block. Returns a vector with one RetentionPriorityAndDuration entry (priority and duration) for each block.
Public Static Attributes
-
static constexpr RetentionPriority kMinRetentionPriority = 0#
-
static constexpr RetentionPriority kMaxRetentionPriority = 100#
-
static constexpr RetentionPriority kDefaultRetentionPriority = 35#
Private Members
-
std::vector<TokenRangeRetentionConfig> mTokenRangeRetentionConfigs#
The token ranges and priority levels to update. Ranges must be non-overlapping. For example [(0, 64), (100, 128), (70, 80)] is valid, whereas [(0, 64), (60, 128)] is not.
-
RetentionPriority mDecodeRetentionPriority#
The priority level to assign to blocks allocated in the decode phase.
-
std::optional<std::chrono::milliseconds> mDecodeDurationMs#
The duration in ms that decode blocks should remain at their assigned priority level.
-
struct TokenRangeRetentionConfig#
- #include <executor.h>
A single entry to set block priorities over a token range. Earlier ranges always take priority over later ones. For example, with a block size of 16, a range of [0, 17] would be applied to the first two blocks.
Public Functions
- inline explicit TokenRangeRetentionConfig(
- SizeType32 tokenStart,
- std::optional<SizeType32> tokenEnd = std::nullopt,
- RetentionPriority priority = KvCacheRetentionConfig::kDefaultRetentionPriority,
- std::optional<std::chrono::milliseconds> durationMs = std::nullopt,
-
inline bool operator==(TokenRangeRetentionConfig const &other) const#
Public Members
-
SizeType32 tokenStart#
The first token of this range.
-
std::optional<SizeType32> tokenEnd#
The final token of this range. The end is not included in the range. This can be set to std::nullopt to extend the range to the end of the sequence.
-
RetentionPriority priority#
The priority of this token range. Higher priorities are less likely to be evicted or offloaded.
-
std::optional<std::chrono::milliseconds> durationMs#
The duration in ms that the block should remain at the given priority level. Set to std::nullopt to have no expiration time, and keep the block at the given priority level until it gets reclaimed. After the duration has passed, the block will be moved back to the
kDefaultRetentionPriority
level.
-
inline explicit KvCacheRetentionConfig()#
-
struct KVCacheStoredBlockData#
- #include <executor.h>
An entry for a single block stored into the tree.
Public Functions
- inline KVCacheStoredBlockData(
- IdType blockHash,
- tensorrt_llm::runtime::VecUniqueTokens tokens,
- tensorrt_llm::runtime::LoraTaskIdType loraId,
- SizeType32 cacheLevel,
- SizeType32 priority,
Public Members
-
tensorrt_llm::runtime::VecUniqueTokens tokens#
The unique tokens of the block.
-
tensorrt_llm::runtime::LoraTaskIdType loraId#
The Lora task id of the block.
-
SizeType32 cacheLevel#
The cache level of the block.
-
SizeType32 priority#
The priority of the block.
-
struct KVCacheStoredData#
Public Members
-
std::vector<KVCacheStoredBlockData> blocks#
A sequence of blocks. The parent of block
i
is block i-1.
-
std::vector<KVCacheStoredBlockData> blocks#
-
struct KVCacheUpdatedData#
Public Functions
- inline KVCacheUpdatedData &cacheLevelUpdated(
- SizeType32 oldValue,
- SizeType32 newValue,
- inline KVCacheUpdatedData &priorityUpdated(
- SizeType32 oldValue,
- SizeType32 newValue,
Public Members
-
std::optional<KVCacheEventDiff<SizeType32>> cacheLevel = std::nullopt#
The updated value of the cacheLevel field.
-
std::optional<KVCacheEventDiff<SizeType32>> priority = std::nullopt#
The updated value of the priority field.
-
class LogitsPostProcessorConfig#
Public Functions
- explicit LogitsPostProcessorConfig(
- std::optional<LogitsPostProcessorMap> processorMap = std::nullopt,
- std::optional<LogitsPostProcessorBatched> processorBatched = std::nullopt,
- bool replicate = true,
-
std::optional<LogitsPostProcessorMap> getProcessorMap() const#
- std::optional<LogitsPostProcessorBatched> getProcessorBatched(
-
bool getReplicate() const#
-
void setProcessorMap(LogitsPostProcessorMap const &processorMap)#
- void setProcessorBatched(
- LogitsPostProcessorBatched const &processorBatched,
-
void setReplicate(bool replicate)#
Private Members
-
std::optional<LogitsPostProcessorMap> mProcessorMap#
mapping from post processor names to non-batched post processors
-
std::optional<LogitsPostProcessorBatched> mProcessorBatched#
single batched post processor
-
bool mReplicate#
If set to true, logits post processor will run on all TP ranks in last PP rank.
-
struct LookaheadDecodingConfig#
Public Functions
- LookaheadDecodingConfig(
- SizeType32 windowSize,
- SizeType32 ngramSize,
- SizeType32 verificationSetSize,
-
inline explicit LookaheadDecodingConfig()#
-
bool operator==(LookaheadDecodingConfig const &other) const#
- std::tuple<SizeType32 const, SizeType32 const, SizeType32 const> get(
-
SizeType32 getWindowSize() const#
-
SizeType32 getNgramSize() const#
-
SizeType32 getVerificationSetSize() const#
- std::tuple<SizeType32, SizeType32, SizeType32, SizeType32> calculateSpeculativeResource(
return <maxDecodingTokens, maxPathLen, maxDraftTokens, maxDraftPathLen>
-
bool isLE(LookaheadDecodingConfig const &that) const#
return true when
this
can be executed on resources defined by that
Public Static Functions
- static bool isLegal(
- SizeType32 windowSize,
- SizeType32 ngramSize,
- SizeType32 verificationSetSize,
return true when the parameter combination is valid.
Private Static Attributes
-
static constexpr SizeType32 kDefaultLookaheadDecodingWindow = 4#
-
static constexpr SizeType32 kDefaultLookaheadDecodingNgram = 3#
-
static constexpr SizeType32 kDefaultLookaheadDecodingVerificationSet = 4#
Friends
- friend class Serialization
-
class LoraConfig#
- #include <executor.h>
Configuration for LoRA.
Public Functions
Private Members
Friends
- friend class Serialization
-
class MropeConfig#
- #include <executor.h>
Configuration for mrope.
Public Functions
- explicit MropeConfig(
- Tensor mropeRoratySinCos,
- SizeType32 mropePositionDeltas,
-
SizeType32 getMRopePositionDeltas() const#
Private Members
-
Tensor mMRopeRotaryCosSin#
The mrope rotary sin and cos cache. Expected shape: [maxPositionEmbeddings*rotaryEmbeddingDim]. Data type must be float32.
-
SizeType32 mMRopePositionDeltas#
The mrope position deltas.
Friends
- friend class Serialization
-
class OrchestratorConfig#
Public Functions
- bool isOrchestrator = true,
- std::string workerExecutablePath = "",
- std::shared_ptr<mpi::MpiComm> orchLeaderComm = nullptr,
- bool spawnProcesses = true,
-
bool getIsOrchestrator() const#
-
std::string getWorkerExecutablePath() const#
-
bool getSpawnProcesses() const#
-
void setIsOrchestrator(bool isOrchestrator)#
-
void setWorkerExecutablePath(std::string const &workerExecutablePath)#
-
void setSpawnProcesses(bool spawnProcesses)#
-
class OutputConfig#
- #include <executor.h>
Configuration that controls the outputs of a Result.
Public Functions
- explicit OutputConfig(
- bool returnLogProbs = false,
- bool returnContextLogits = false,
- bool returnGenerationLogits = false,
- bool excludeInputFromOutput = false,
- bool returnEncoderOutput = false,
- bool returnPerfMetrics = false,
- std::optional<std::vector<AdditionalModelOutput>> additionalModelOutputs = std::nullopt,
Public Members
-
bool returnGenerationLogits#
Controls if Result should contain the generation logits. Default is false.
-
bool excludeInputFromOutput#
Controls if output tokens in Result should include the input tokens. Default is false.
-
bool returnEncoderOutput#
Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Default is false.
-
std::optional<std::vector<AdditionalModelOutput>> additionalModelOutputs#
The additional outputs to gather from the model.
-
class AdditionalModelOutput#
- #include <executor.h>
Additional output that should be gathered.
By default gather output of shape [beamWidth, x] from each generation phase. If gatherContext is true, also gather output of shape [promptLen, x] from context phase.
Public Functions
- explicit AdditionalModelOutput(
- std::string name,
- bool gatherContext = false,
-
class ParallelConfig#
- #include <executor.h>
A configuration class for the parallel execution parameters. Currently only supports commType = CommunicationType::kMPI.
Public Functions
- explicit ParallelConfig(
- CommunicationType commType = CommunicationType::kMPI,
- CommunicationMode commMode = CommunicationMode::kLEADER,
- std::optional<std::vector<SizeType32>> deviceIds = std::nullopt,
- std::optional<std::vector<SizeType32>> participantIds = std::nullopt,
- std::optional<OrchestratorConfig> const &orchestratorConfig = std::nullopt,
Constructor.
- Parameters:
commType – The communication type. See CommunicationType.
commMode – The communication mode. See CommunicationMode.
deviceIds – The IDs of the GPUs involved in the execution of the model
participantIds – The participant IDs (MPI ranks if commType == kMPI) involved in the execution of the model. The first participant is considered to be the leader.
-
CommunicationType getCommunicationType() const#
-
CommunicationMode getCommunicationMode() const#
-
std::optional<std::vector<SizeType32>> getDeviceIds() const#
-
std::optional<std::vector<SizeType32>> getParticipantIds() const#
-
std::optional<OrchestratorConfig> getOrchestratorConfig() const#
-
void setCommunicationType(CommunicationType type)#
-
void setCommunicationMode(CommunicationMode mode)#
-
void setDeviceIds(std::vector<SizeType32> const &deviceIds)#
- void setParticipantIds(
- std::vector<SizeType32> const &participantIds,
- void setOrchestratorConfig(
- OrchestratorConfig const &orchestratorConfig,
Private Members
-
CommunicationType mCommType#
The type of communication protocol used. Default is MPI.
-
CommunicationMode mCommMode#
The mode of communication. See CommunicationMode.
-
std::optional<std::vector<SizeType32>> mDeviceIds#
The GPU device ids to use for executing this model.
-
std::optional<std::vector<SizeType32>> mParticipantIds#
The participant ids (MPI ranks for example) used for executing this model.
-
std::optional<OrchestratorConfig> mOrchestratorConfig#
Optional orchestrator configuration.
Friends
- friend class Serialization
-
class PeftCacheConfig#
- #include <executor.h>
config for PeftCacheManager
Public Functions
- explicit PeftCacheConfig(
- SizeType32 numHostModuleLayer = 0,
- SizeType32 numDeviceModuleLayer = 0,
- SizeType32 optimalAdapterSize = kDefaultOptimalAdapterSize,
- SizeType32 maxAdapterSize = kDefaultMaxAdapterSize,
- SizeType32 numPutWorkers = 1,
- SizeType32 numEnsureWorkers = 1,
- SizeType32 numCopyStreams = 1,
- SizeType32 maxPagesPerBlockHost = kDefaultMaxPagesPerBlockHost,
- SizeType32 maxPagesPerBlockDevice = kDefaultMaxPagesPerBlockDevice,
- std::optional<float> const &deviceCachePercent = std::nullopt,
- std::optional<size_t> const &hostCacheSize = std::nullopt,
-
bool operator==(PeftCacheConfig const &other) const#
-
SizeType32 getNumHostModuleLayer() const#
-
SizeType32 getNumDeviceModuleLayer() const#
-
SizeType32 getOptimalAdapterSize() const#
-
SizeType32 getMaxAdapterSize() const#
-
SizeType32 getNumPutWorkers() const#
-
SizeType32 getNumEnsureWorkers() const#
-
SizeType32 getNumCopyStreams() const#
-
SizeType32 getMaxPagesPerBlockHost() const#
-
SizeType32 getMaxPagesPerBlockDevice() const#
-
std::optional<float> getDeviceCachePercent() const#
-
std::optional<size_t> getHostCacheSize() const#
Public Static Attributes
-
static constexpr SizeType32 kDefaultOptimalAdapterSize = 8#
-
static constexpr SizeType32 kDefaultMaxAdapterSize = 64#
-
static constexpr SizeType32 kDefaultMaxPagesPerBlockHost = 24#
-
static constexpr SizeType32 kDefaultMaxPagesPerBlockDevice = 8#
Private Members
-
SizeType32 mNumHostModuleLayer#
-
SizeType32 mNumDeviceModuleLayer#
-
SizeType32 mOptimalAdapterSize#
-
SizeType32 mMaxAdapterSize#
-
SizeType32 mNumPutWorkers#
-
SizeType32 mNumEnsureWorkers#
-
SizeType32 mNumCopyStreams#
-
SizeType32 mMaxPagesPerBlockHost#
-
SizeType32 mMaxPagesPerBlockDevice#
-
std::optional<size_t> mHostCacheSize#
Friends
- friend class Serialization
-
class PromptTuningConfig#
- #include <executor.h>
Configuration for prompt tuning.
Public Functions
- explicit PromptTuningConfig(
- Tensor embeddingTable,
- std::optional<VecTokenExtraIds> inputTokenExtraIds = std::nullopt,
-
std::optional<VecTokenExtraIds> getInputTokenExtraIds() const#
Private Members
-
Tensor mEmbeddingTable#
The prompt embedding table. Expected shape: [task vocab_size, hidden_size]. Data type must match model weights.
-
std::optional<VecTokenExtraIds> mInputTokenExtraIds#
The input token extra ids for KV Cache reuse when p-tuning is enabled.
Friends
- friend class Serialization
-
class Request#
- #include <executor.h>
A class that holds information about the request.
Public Functions
- Request(
- VecTokens inputTokenIds,
- SizeType32 maxTokens,
- bool streaming = false,
- SamplingConfig const &samplingConfig = SamplingConfig(),
- OutputConfig const &outputConfig = OutputConfig(),
- std::optional<SizeType32> const &endId = std::nullopt,
- std::optional<SizeType32> const &padId = std::nullopt,
- std::optional<std::vector<SizeType32>> positionIds = std::nullopt,
- std::optional<std::list<VecTokens>> badWords = std::nullopt,
- std::optional<std::list<VecTokens>> stopWords = std::nullopt,
- std::optional<Tensor> embeddingBias = std::nullopt,
- std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig = std::nullopt,
- std::optional<PromptTuningConfig> pTuningConfig = std::nullopt,
- std::optional<MropeConfig> mRopeConfig = std::nullopt,
- std::optional<LoraConfig> loraConfig = std::nullopt,
- std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
- std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
- std::optional<std::string> logitsPostProcessorName = std::nullopt,
- std::optional<VecTokens> encoderInputTokenIds = std::nullopt,
- std::optional<IdType> clientId = std::nullopt,
- bool returnAllGeneratedTokens = false,
- PriorityType priority = kDefaultPriority,
- RequestType type = RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION,
- std::optional<ContextPhaseParams> contextPhaseParams = std::nullopt,
- std::optional<Tensor> encoderInputFeatures = std::nullopt,
- std::optional<SizeType32> encoderOutputLength = std::nullopt,
- std::optional<Tensor> crossAttentionMask = std::nullopt,
- SizeType32 numReturnSequences = 1,
- std::optional<EagleConfig> eagleConfig = std::nullopt,
- std::optional<Tensor> skipCrossAttnBlocks = std::nullopt,
- std::optional<GuidedDecodingParams> guidedDecodingParams = std::nullopt,
- std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
The Request constructor.
- Parameters:
inputTokenIds – The input token ids
maxTokens – The maximum number of tokens to generate
streaming – Indicates if the responses should be streamed or not. Default is false.
samplingConfig – The sampling configuration
outputConfig – The output configuration
endId – The end token id
padId – The pad token id
positionIds – The input position ids
badWords – A list of bad words tokens. Each “word” can be composed of multiple tokens
stopWords – A list of stop words tokens. Each “word” can be composed of multiple tokens
embeddingBias – The embedding bias tensor. Expected shape is [vocab_size]
externalDraftTokensConfig – The speculative decoding with external draft tokens configuration
pTuningConfig – The prompt tuning configuration
loraConfig – The LoRA configuration
lookaheadConfig – The lookahead speculative decoding configuration
logitsPostProcessorName – The logits postprocessor name. Must correspond to one of the logits postprocessor names provided to the ExecutorConfig.
kvCacheRetentionConfig – The configuration used for KV cache block eviction.
encoderInputTokenIds – The encoder input token ids for encoder-decoder models, or encoder-only models
returnAllGeneratedTokens – Indicates whether to return the full beams or just the newly generated tokens after every streaming step.
priority – Sets the execution priority of this request.
encoderInputFeatures – Encoder input features for multimodal models.
encoderOutputLength – Encoder output length if encoder input and output have different lengths (due to convolution down-sampling, etc.)
crossAttentionMask – Cross attention mask.
type – Indicate the request type for disaggregated serving mode.
contextPhaseParams – Generated token ID from context only executor.
numReturnSequences – The number of returning sequences.
eagleConfig – The EAGLE speculative decoding configuration
skipCrossAttnBlocks – Skip the cross attention transformer blocks or not.
guidedDecodingParams – The guided decoding parameters.
allottedTimeMs – The allotted time in milliseconds after which the request is finished with a timedOut finish reason. The request will always exceed this time slightly, but by at most one forward pass. A request can be timed out before ever being scheduled.
-
~Request()#
-
SizeType32 getMaxTokens() const#
-
SizeType32 getMaxNewTokens() const#
-
bool getStreaming() const#
-
SamplingConfig getSamplingConfig() const#
-
OutputConfig getOutputConfig() const#
-
std::optional<SizeType32> getEndId() const#
-
std::optional<SizeType32> getPadId() const#
-
std::optional<std::vector<SizeType32>> getPositionIds() const#
- std::optional<ExternalDraftTokensConfig> getExternalDraftTokensConfig(
-
std::optional<PromptTuningConfig> getPromptTuningConfig() const#
-
std::optional<MropeConfig> getMropeConfig() const#
-
std::optional<LoraConfig> getLoraConfig() const#
-
std::optional<LookaheadDecodingConfig> getLookaheadConfig() const#
- std::optional<KvCacheRetentionConfig> getKvCacheRetentionConfig(
-
std::optional<std::string> getLogitsPostProcessorName() const#
-
PriorityType getPriority() const#
-
bool getReturnAllGeneratedTokens() const#
- std::optional<ContextPhaseParams> const &getContextPhaseParams(
-
std::optional<SizeType32> getEncoderOutputLength() const#
-
RequestType getRequestType() const#
-
SizeType32 getNumReturnSequences() const#
-
std::optional<EagleConfig> getEagleConfig() const#
-
std::optional<GuidedDecodingParams> getGuidedDecodingParams() const#
-
std::optional<MillisecondsType> getAllottedTimeMs() const#
- std::optional<std::vector<std::string>> getAdditionalOutputNames(
-
void setStreaming(bool streaming)#
-
void setSamplingConfig(SamplingConfig const &config)#
-
void setOutputConfig(OutputConfig const &outputConfig)#
-
void setEndId(SizeType32 endId)#
-
void setPadId(SizeType32 padId)#
-
void setPositionIds(std::vector<SizeType32> const &positionIds)#
- void setExternalDraftTokensConfig(
- ExternalDraftTokensConfig const &externalDraftTokensConfig,
-
void setPromptTuningConfig(PromptTuningConfig const &pTuningConfig)#
-
void setMropeConfig(MropeConfig const &mRopeConfig)#
-
void setLoraConfig(LoraConfig const &loraConfig)#
- void setLookaheadConfig(
- LookaheadDecodingConfig const &lookaheadConfig,
- void setKvCacheRetentionConfig(
- KvCacheRetentionConfig const &kvCacheRetentionConfig,
- void setLogitsPostProcessorName(
- std::string const &logitsPostProcessorName,
-
void setPriority(PriorityType priority)#
-
void setReturnAllGeneratedTokens(bool returnAllGeneratedTokens)#
-
void setRequestType(RequestType const &requestType)#
-
void setContextPhaseParams(ContextPhaseParams contextPhaseParams)#
-
void setEncoderOutputLength(SizeType32 encoderOutputLength)#
-
void setNumReturnSequences(SizeType32 numReturnSequences)#
-
void setEagleConfig(std::optional<EagleConfig> const &eagleConfig)#
- void setGuidedDecodingParams(
- GuidedDecodingParams const &guidedDecodingParams,
-
void setAllottedTimeMs(MillisecondsType allottedTimeMs)#
- void setAdditionalOutputNames(
- std::optional<std::vector<std::string>> additionalOutputNames,
Public Static Attributes
-
static constexpr PriorityType kDefaultPriority = 0.5#
-
static auto constexpr kBatchedPostProcessorName = "batched"#
This logits postprocessor name will dispatch to the batched logits postprocessor.
Private Members
-
std::unique_ptr<Impl> mImpl#
Friends
- friend class Serialization
-
class Response#
- #include <executor.h>
Class that holds either an error or a result.
Public Functions
-
~Response()#
-
std::optional<IdType> getClientId() const#
Get the client id of the request for which this response was generated.
-
bool hasError() const#
Indicates if this response has an error or not.
-
std::string const &getErrorMsg() const#
Get the error message for this response. Will throw an exception if hasError is false.
Private Members
-
std::unique_ptr<Impl> mImpl#
Friends
- friend class Serialization
-
~Response()#
-
struct Result#
- #include <executor.h>
Struct that holds the generation result.
Public Members
-
bool isFinal#
Indicates if this is the final result for the request.
-
BeamTokens outputTokenIds#
The output tokens for each beam.
-
std::optional<VecLogProbs> cumLogProbs#
The cumulative log probabilities. Size beamSize.
-
std::optional<std::vector<VecLogProbs>> logProbs#
The log probabilities for each generated token. Size [beamSize, outputLen].
-
std::optional<Tensor> generationLogits#
The generation logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming) or [maxNewTokens, beamSize, vocabSizePadded] (streaming and allGeneratedTokens) or [1, beamSize, vocabSizePadded] (streaming and non-allGeneratedTokens)
-
std::optional<SpeculativeDecodingFastLogitsInfo> specDecFastLogitsInfo#
Logits information for direct transfer when using fast logits.
-
std::vector<FinishReason> finishReasons#
The reason why the model stopped generating tokens for each beam in this request. Size [beamSize]. Currently only supported when beamSize is 1 and when using BatchingType::kINFLIGHT.
-
std::optional<ContextPhaseParams> contextPhaseParams#
The params of the context phase.
-
SizeType32 decodingIter = {0}#
The number of the decoding iterations used to generate the result. In autoregressive decoding, it is equal to the maximum length of the beam in outputTokenIds. In speculative decoding, might be less than maximum length of the beam in outputTokenIds as more than one token can be generated per iteration. Used for speculative decoding statistics.
-
SizeType32 sequenceIndex = {0}#
The index of the output sequence of this result where 0 <= sequenceIndex < numReturnSequences. In beam search (beamWidth > 1), this index will be always zero because all beams to be returned are included in this result.
-
bool isSequenceFinal#
Indicates if this is the final result for a given sequence in the request. In beam search (beamWidth > 1), the value will always be equal to the value of isFinal.
-
std::optional<RequestPerfMetrics> requestPerfMetrics#
Performance metrics if returnPerfMetrics is set in OutputConfig.
-
std::vector<AdditionalOutput> additionalOutputs#
The additional outputs.
-
bool isFinal#
-
struct RetentionPriorityAndDuration#
Public Functions
- inline RetentionPriorityAndDuration(
- std::optional<RetentionPriority> const &retentionPriority,
- std::optional<std::chrono::milliseconds> const &durationMs,
Public Members
-
std::optional<RetentionPriority> retentionPriority#
-
std::optional<std::chrono::milliseconds> durationMs#
-
class SamplingConfig#
- #include <executor.h>
Sampling configuration.
Public Functions
- explicit SamplingConfig(
- SizeType32 beamWidth = 1,
- std::optional<SizeType32> const &topK = std::nullopt,
- std::optional<FloatType> const &topP = std::nullopt,
- std::optional<FloatType> const &topPMin = std::nullopt,
- std::optional<TokenIdType> const &topPResetIds = std::nullopt,
- std::optional<FloatType> const &topPDecay = std::nullopt,
- std::optional<RandomSeedType> const &seed = std::nullopt,
- std::optional<FloatType> const &temperature = std::nullopt,
- std::optional<SizeType32> const &minTokens = std::nullopt,
- std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt,
- std::optional<FloatType> const &repetitionPenalty = std::nullopt,
- std::optional<FloatType> const &presencePenalty = std::nullopt,
- std::optional<FloatType> const &frequencyPenalty = std::nullopt,
- std::optional<FloatType> const &lengthPenalty = std::nullopt,
- std::optional<SizeType32> const &earlyStopping = std::nullopt,
- std::optional<SizeType32> const &noRepeatNgramSize = std::nullopt,
- std::optional<SizeType32> const &numReturnSequences = std::nullopt,
Constructor for SamplingConfig. See description of parameters below.
-
bool operator==(SamplingConfig const &other) const#
-
SizeType32 getBeamWidth() const#
-
SizeType32 getNumReturnBeams() const#
-
std::optional<SizeType32> getTopK() const#
-
std::optional<SizeType32> getTopPResetIds() const#
-
std::optional<RandomSeedType> getSeed() const#
-
std::optional<RandomSeedType> getRandomSeed() const#
-
std::optional<SizeType32> getMinTokens() const#
-
std::optional<SizeType32> getMinLength() const#
-
std::optional<SizeType32> getEarlyStopping() const#
-
std::optional<SizeType32> getNoRepeatNgramSize() const#
-
std::optional<SizeType32> getNumReturnSequences() const#
-
void setBeamWidth(SizeType32 beamWidth)#
-
void setTopK(std::optional<SizeType32> const &topK)#
- void setTopPResetIds(
- std::optional<TokenIdType> const &topPResetIds,
-
void setSeed(std::optional<RandomSeedType> const &seed)#
-
void setRandomSeed(std::optional<RandomSeedType> const &randomSeed)#
-
void setMinTokens(std::optional<SizeType32> const &minTokens)#
-
void setMinLength(std::optional<SizeType32> const &minLength)#
- void setEarlyStopping(
- std::optional<SizeType32> const &earlyStopping,
- void setNoRepeatNgramSize(
- std::optional<SizeType32> const &noRepeatNgramSize,
- void setNumReturnSequences(
- std::optional<SizeType32> const &numReturnSequences,
Private Functions
-
void updateNumReturnBeams()#
Private Members
-
SizeType32 mBeamWidth#
The beam width. Default is 1 which disables beam search.
-
std::optional<SizeType32> mTopK#
Controls number of logits to sample from. Default is 0 (all logits).
-
std::optional<FloatType> mTopPMin#
Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.
-
std::optional<TokenIdType> mTopPResetIds#
Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.
-
std::optional<FloatType> mTopPDecay#
Controls decay in the top-P algorithm. The decay value. Default is 1.f.
-
std::optional<RandomSeedType> mSeed#
Controls the random seed used by the random number generator in sampling.
-
std::optional<FloatType> mTemperature#
Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f.
-
std::optional<SizeType32> mMinTokens#
Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.
-
std::optional<FloatType> mRepetitionPenalty#
Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourages repetition, values > 1.f discourages it. Default is 1.f.
-
std::optional<FloatType> mPresencePenalty#
Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
-
std::optional<FloatType> mFrequencyPenalty#
Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
-
std::optional<FloatType> mLengthPenalty#
Controls how to penalize longer sequences in beam search. Default is 0.f.
-
std::optional<SizeType32> mEarlyStopping#
Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)
-
std::optional<SizeType32> mNoRepeatNgramSize#
Controls how many repeat ngram size are acceptable. Default is 1 << 30.
-
std::optional<SizeType32> mNumReturnSequences#
The number of return sequences or beams. In beam search, the value should be less than or equal to mBeamWidth. In sampling, it specifies the total number of independently generated sequences.
-
SizeType32 mNumReturnBeams#
The number of beams to return. It is equal to beamWidth unless numReturnSequences is set. If beamWidth > 1 and numReturnSequences is set, then numReturnBeams is equal to numReturnSequences.
Private Static Functions
-
static SizeType32 checkBeamWidth(SizeType32 beamWidth)#
- static std::optional<TokenIdType> const &checkTopPResetIds(
- std::optional<TokenIdType> const &topPResetIds,
- static std::optional<FloatType> const &checkTopPDecay(
- std::optional<FloatType> const &topPDecay,
- static std::optional<FloatType> const &checkTemperature(
- std::optional<FloatType> const &temperature,
- static std::optional<FloatType> const &checkRepetitionPenalty(
- std::optional<FloatType> const &penalty,
- static std::optional<SizeType32> const &checkMinTokens(
- std::optional<SizeType32> const &minTokens,
- static std::optional<SizeType32> const &checkNoRepeatNgramSize(
- std::optional<SizeType32> const &noRepeatNgramSize,
- static std::optional<FloatType> const &checkBeamSearchDiversityRate(
- std::optional<FloatType> const &beamSearchDiversityRate,
- static std::optional<SizeType32> const &checkNumReturnSequences(
- std::optional<SizeType32> const &numReturnSequences,
- SizeType32 beamWidth,
Friends
- friend class Serialization
-
class SchedulerConfig#
- #include <executor.h>
Configuration class for the scheduler.
Public Functions
- explicit SchedulerConfig(
- CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
- std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt,
- std::optional<DynamicBatchConfig> dynamicBatchConfig = std::nullopt,
-
bool operator==(SchedulerConfig const &other) const#
-
CapacitySchedulerPolicy getCapacitySchedulerPolicy() const#
- std::optional<ContextChunkingPolicy> getContextChunkingPolicy(
-
std::optional<DynamicBatchConfig> getDynamicBatchConfig() const#
Private Members
-
CapacitySchedulerPolicy mCapacitySchedulerPolicy#
The capacity scheduler policy. See CapacitySchedulerPolicy.
-
std::optional<ContextChunkingPolicy> mContextChunkingPolicy#
The context chunking policy. See ContextChunkingPolicy.
-
std::optional<DynamicBatchConfig> mDynamicBatchConfig#
The config for tuning batch size dynamically. See DynamicBatchConfig.
Friends
- friend class Serialization
-
class SpeculativeDecodingConfig#
- #include <executor.h>
Configuration for speculative decoding (both draft and target models)
Public Functions
-
explicit SpeculativeDecodingConfig(bool fastLogits = false)#
-
bool operator==(SpeculativeDecodingConfig const &other) const#
Public Members
-
bool fastLogits#
Send logits tensor directly from draft to target model.
-
explicit SpeculativeDecodingConfig(bool fastLogits = false)#
-
using RetentionPriority = SizeType32#
-
namespace mpi#
-
namespace executor
types.h#
-
namespace tensorrt_llm
-
namespace executor
Typedefs
-
using SizeType32 = std::int32_t#
-
using FloatType = float#
-
using TokenIdType = std::int32_t#
-
using VecTokens = std::vector<TokenIdType>#
-
using IdType = std::uint64_t#
-
using IterationType = std::uint64_t#
-
using RandomSeedType = std::uint64_t#
-
using StreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>#
-
using MillisecondsType = std::chrono::milliseconds#
-
using LogitsPostProcessor = std::function<void(IdType, Tensor&, BeamTokens const&, StreamPtr const&, std::optional<IdType>)>#
-
using LogitsPostProcessorMap = std::unordered_map<std::string, LogitsPostProcessor>#
-
using LogitsPostProcessorBatched = std::function<void(std::vector<IdType> const&, std::vector<Tensor>&, std::vector<std::reference_wrapper<BeamTokens const>> const&, StreamPtr const&, std::vector<std::optional<IdType>> const&)>#
-
using MedusaChoices = std::vector<std::vector<SizeType32>>#
-
using EagleChoices = std::vector<std::vector<SizeType32>>#
-
using PriorityType = float#
-
using BufferView = std::basic_string_view<uint8_t>#
Enums
-
enum class DataType#
Values:
-
enumerator kBOOL#
-
enumerator kUINT8#
-
enumerator kINT8#
-
enumerator kINT32#
-
enumerator kINT64#
-
enumerator kBF16#
-
enumerator kFP8#
-
enumerator kFP16#
-
enumerator kFP32#
-
enumerator kUNKNOWN#
-
enumerator kBOOL#
-
enum class RequestType#
Values:
-
enumerator REQUEST_TYPE_CONTEXT_AND_GENERATION#
-
enumerator REQUEST_TYPE_CONTEXT_ONLY#
-
enumerator REQUEST_TYPE_GENERATION_ONLY#
-
enumerator REQUEST_TYPE_CONTEXT_AND_GENERATION#
-
enum class MemoryType#
Values:
-
enumerator kCPU#
-
enumerator kCPU_PINNED#
-
enumerator kCPU_PINNEDPOOL#
-
enumerator kGPU#
-
enumerator kUVM#
-
enumerator kUNKNOWN#
-
enumerator kCPU#
-
enum class ModelType#
Values:
-
enumerator kDECODER_ONLY#
-
enumerator kENCODER_ONLY#
-
enumerator kENCODER_DECODER#
-
enumerator kDECODER_ONLY#
-
enum class BatchingType#
The batching type.
Values:
-
enumerator kSTATIC#
STATIC refers to the traditional batching scheme with a batch of requests running in lockstep until the full generation for all of them is complete. Requests in a batch are all padded up to the maximum input and output sequence length of any member of the batch.
-
enumerator kINFLIGHT#
INFLIGHT refers to a scheme where newly arrived requests are dynamically incorporated into the batch under execution, and requests are returned as soon as the end condition is met without any padding.
-
enumerator kSTATIC#
-
enum class CapacitySchedulerPolicy#
The policy used to select the subset of available requests in each iteration of the executor generation loop.
Values:
-
enumerator kMAX_UTILIZATION#
MAX_UTILIZATION packs as many requests as the underlying TRT engine can support in any iteration of the InflightBatching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability.
-
enumerator kGUARANTEED_NO_EVICT#
GUARANTEED_NO_EVICT uses KV cache more conservatively guaranteeing that a request, once started, will run to completion without eviction.
-
enumerator kSTATIC_BATCH#
kSTATIC_BATCH does not schedule new requests until all requests in the current batch are completed. Similar to kGUARANTEED_NO_EVICT, requests will run to completion without eviction.
-
enumerator kMAX_UTILIZATION#
-
enum class ContextChunkingPolicy#
Values:
-
enumerator kFIRST_COME_FIRST_SERVED#
Sequential chunking, complete the unfinished context phase first.
-
enumerator kEQUAL_PROGRESS#
Iterate through each context request in sequence and attempt to increase its chunk count until the constraint is exceeded.
-
enumerator kFIRST_COME_FIRST_SERVED#
-
enum class RequestStage#
Enum class that represents the state of a request.
Values:
-
enumerator kQUEUED#
Requests that have been received but not yet included in the active requests (due to constraints such as maximum batch size, for example).
-
enumerator kENCODER_IN_PROGRESS#
Active request in encoder phase.
-
enumerator kCONTEXT_IN_PROGRESS#
Active request in context phase.
-
enumerator kGENERATION_IN_PROGRESS#
Active request in generation phase.
-
enumerator kGENERATION_COMPLETE#
Active request for which generation has completed.
-
enumerator kQUEUED#
-
enum class FinishReason#
The reason why the model stopped generating tokens for a request.
Values:
-
enumerator kNOT_FINISHED#
The request is not finished.
-
enumerator kEND_ID#
The request finished because the end id was generated.
-
enumerator kSTOP_WORDS#
The request finished because a stop word was generated.
-
enumerator kLENGTH#
The request finished because the maximum number of tokens was reached.
-
enumerator kTIMED_OUT#
The request finished because it timed out (via the mAllotedTime parameter).
-
enumerator kCANCELLED#
The request was cancelled by calling cancelRequest.
-
enumerator kNOT_FINISHED#
Functions
- std::ostream &operator<<(
- std::ostream &os,
- CapacitySchedulerPolicy policy,
- std::ostream &operator<<(
- std::ostream &os,
- ContextChunkingPolicy policy,
-
struct DebugTensorsPerIteration#
- #include <types.h>
Struct that holds the debug tensors in an iteration.
Public Members
-
IterationType iter#
The iteration id for these tensors.
-
IterationType iter#
-
class DecodingMode#
- #include <types.h>
Mode of the decoder.
Public Types
-
using UnderlyingType = uint32_t#
Public Functions
-
inline auto constexpr useTemperature(bool useTemp)#
-
inline auto constexpr useOccurrencePenalties(bool usePenalty)#
-
inline auto constexpr usePresencePenalty(bool usePenalty)#
-
inline auto constexpr useRepetitionPenalty(bool usePenalty)#
-
inline auto constexpr useFrequencyPenalty(bool usePenalty)#
-
inline auto constexpr useMinLength(bool useMinLen)#
-
inline auto constexpr useBanTokens(bool banTokens)#
-
inline auto constexpr useBanWords(bool banWords)#
-
inline auto constexpr useNoRepeatNgramSize(bool noRepeatNgramSize)#
-
inline auto constexpr useStopWords(bool stopWords)#
-
inline auto constexpr useMaxLengthStop(bool maxLengthStop)#
-
inline auto constexpr useExplicitEosStop(bool explicitEosStop)#
-
inline bool constexpr isAuto() const#
-
inline bool constexpr isTopK() const#
-
inline bool constexpr isTopP() const#
-
inline bool constexpr isTopKorTopP() const#
-
inline bool constexpr isTopKandTopP() const#
-
inline bool constexpr isBeamSearch() const#
-
inline bool constexpr isMedusa() const#
-
inline bool constexpr isLookahead() const#
-
inline bool constexpr isExplicitDraftTokens() const#
-
inline bool constexpr isExternalDraftTokens() const#
-
inline bool constexpr isEagle() const#
-
inline bool constexpr isUseTemperature() const#
-
inline bool constexpr isUsePresencePenalty() const#
-
inline bool constexpr isUseFrequencyPenalty() const#
-
inline bool constexpr isUseRepetitionPenalty() const#
-
inline bool constexpr isUseMinLength() const#
-
inline bool constexpr isUseOccurrencePenalty() const#
-
inline bool constexpr isUsePenalty() const#
-
inline bool constexpr isUseBanWords() const#
-
inline bool constexpr isUseNoRepeatNgramSize() const#
-
inline bool constexpr isUseBanTokens() const#
-
inline bool constexpr isUseStopWords() const#
-
inline bool constexpr isUseMaxLengthStop() const#
-
inline bool constexpr isUseExplicitEosStop() const#
-
inline bool constexpr isUseStopCriteria() const#
-
inline bool operator==(DecodingMode const &other) const#
-
inline explicit constexpr DecodingMode(UnderlyingType state)#
-
inline constexpr UnderlyingType getState() const#
Public Static Functions
-
static inline auto constexpr Auto()#
No mode specified. The config will be determined at runtime from the beam width of the first request: TopKTopP if beamWidth == 1, BeamSearch otherwise.
-
static inline auto constexpr TopK()#
-
static inline auto constexpr TopP()#
-
static inline auto constexpr TopKTopP()#
-
static inline auto constexpr BeamSearch()#
-
static inline auto constexpr Medusa()#
-
static inline auto constexpr Lookahead()#
-
static inline auto constexpr ExplicitDraftTokens()#
-
static inline auto constexpr ExternalDraftTokens()#
-
static inline auto constexpr Eagle()#
Private Functions
-
inline bool constexpr anyBitSet(UnderlyingType bits) const#
-
inline bool constexpr allBitSet(UnderlyingType bits) const#
- inline UnderlyingType constexpr setBitTo(
- UnderlyingType state,
- bool x,
Private Members
-
UnderlyingType mState = {}#
Private Static Attributes
-
static UnderlyingType constexpr kUseRepetitionPenalties = {1u << 0}#
-
static UnderlyingType constexpr kUseFrequencyPenalties = {1u << 1}#
-
static UnderlyingType constexpr kUsePresencePenalties = {1u << 2}#
-
static UnderlyingType constexpr kUseTemperature = {1u << 3}#
-
static UnderlyingType constexpr kUseMinLength = {1u << 4}#
-
static UnderlyingType constexpr kUseBanWords = {1u << 5}#
-
static UnderlyingType constexpr kUseStopWords = {1u << 6}#
-
static UnderlyingType constexpr kUseMaxLengthStop = {1u << 7}#
-
static UnderlyingType constexpr kUseExplicitEosStop = {1u << 8}#
-
static UnderlyingType constexpr kUseNoRepeatNgramSize = {1u << 9}#
-
static UnderlyingType constexpr kStandardStopCriteria = {kUseStopWords | kUseMaxLengthStop}#
-
static UnderlyingType constexpr kUseOccurrencePenalties{kUseRepetitionPenalties | kUseFrequencyPenalties | kUsePresencePenalties}#
-
static UnderlyingType constexpr kUsePenalties = {kUseOccurrencePenalties | kUseTemperature | kUseMinLength}#
-
static UnderlyingType constexpr kUseBanTokens = {kUseNoRepeatNgramSize | kUseBanWords}#
-
static SizeType32 constexpr kNumFlags = {10}#
-
static UnderlyingType constexpr kAuto = {1u << (kNumFlags + 0)}#
-
static UnderlyingType constexpr kTopK = {1u << (kNumFlags + 1)}#
-
static UnderlyingType constexpr kTopP = {1u << (kNumFlags + 2)}#
-
static UnderlyingType constexpr kBeamSearch = {1u << (kNumFlags + 3)}#
-
static UnderlyingType constexpr kMedusa = {1u << (kNumFlags + 4)}#
-
static UnderlyingType constexpr kLookahead = {1u << (kNumFlags + 5)}#
-
static UnderlyingType constexpr kExplicitDraftTokens = {1u << (kNumFlags + 6)}#
-
static UnderlyingType constexpr kExternalDraftTokens = {1u << (kNumFlags + 7)}#
-
static UnderlyingType constexpr kEagle = {1u << (kNumFlags + 8)}#
-
static UnderlyingType constexpr kTopKTopP = {kTopK | kTopP}#
-
using UnderlyingType = uint32_t#
-
struct DisServingRequestStats#
- #include <types.h>
Struct that holds the request stats in the case of disaggregated serving.
Public Members
-
double kvCacheTransferMS#
The total time spent on transferring KV cache from context phase to generation phase (ms)
-
double kvCacheTransferMS#
-
struct InflightBatchingStats#
- #include <types.h>
Struct that holds the stats of inflight batching models for a single iteration.
Public Members
-
SizeType32 numScheduledRequests#
Number of scheduled requests.
-
SizeType32 numContextRequests#
Number of requests in context stage.
-
SizeType32 numGenRequests#
Number of requests in generation stage.
-
SizeType32 numPausedRequests#
Number of paused requests.
-
SizeType32 numCtxTokens#
Total number of context tokens in the iteration.
-
SizeType32 microBatchId#
Index of micro batch.
-
float avgNumDecodedTokensPerIter#
Average number of tokens decoded per request per iteration.
-
SizeType32 numScheduledRequests#
-
struct IterationStats#
- #include <types.h>
Struct that holds the stats of a single iteration.
Public Members
-
std::string timestamp#
Ending time of this iteration.
-
IterationType iter#
Iteration id.
-
double iterLatencyMS#
Iteration latency (ms)
-
double newActiveRequestsQueueLatencyMS#
The total time spent in queue by the requests that became active in this iteration (ms)
-
SizeType32 numNewActiveRequests#
Number of new fetched active requests.
-
SizeType32 numActiveRequests#
Number of active requests.
-
SizeType32 numQueuedRequests#
Number of queued requests.
-
SizeType32 numCompletedRequests#
Number of requests that were completed in this iteration.
-
SizeType32 maxNumActiveRequests#
Maximum number of active requests.
-
SizeType32 maxBatchSizeStatic#
Static max batch size passed to the executor.
-
SizeType32 maxBatchSizeTunerRecommended#
Batch size produced by dynamic tuner based on input stats.
-
SizeType32 maxBatchSizeRuntime#
The min of maxBatchSizeStatic and maxBatchSizeRuntimeUpperbound.
-
SizeType32 maxNumTokensStatic#
Static max num tokens passed to the executor.
-
SizeType32 maxNumTokensTunerRecommended#
Max num tokens produced by dynamic tuner based on input stats.
-
SizeType32 maxNumTokensRuntime#
The runtime max num tokens.
-
size_t gpuMemUsage#
GPU memory usage in bytes.
-
size_t cpuMemUsage#
CPU memory usage in bytes.
-
size_t pinnedMemUsage#
Pinned memory usage in bytes.
-
std::optional<KvCacheStats> kvCacheStats#
Stats specific to KV caches.
-
std::optional<KvCacheStats> crossKvCacheStats#
Stats specific to cross KV caches.
-
std::optional<StaticBatchingStats> staticBatchingStats#
Stats specific to static batching.
-
std::optional<InflightBatchingStats> inflightBatchingStats#
Stats specific to inflight batching.
-
std::string timestamp#
-
struct KvCacheStats#
- #include <types.h>
Struct that holds the stats of a KV cache manager.
Public Members
-
SizeType32 maxNumBlocks#
Max number of blocks.
-
SizeType32 freeNumBlocks#
Number of free blocks.
-
SizeType32 usedNumBlocks#
Number of used blocks.
-
SizeType32 tokensPerBlock#
Number of tokens per block.
-
SizeType32 allocTotalBlocks#
Total number of allocated blocks.
-
SizeType32 allocNewBlocks#
Number of newly allocated blocks.
-
SizeType32 reusedBlocks#
Number of reused blocks.
-
SizeType32 missedBlocks#
Number of blocks that were not reused.
-
float cacheHitRate#
Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
-
SizeType32 maxNumBlocks#
-
struct RequestPerfMetrics#
- #include <types.h>
Struct that holds the stats of a request.
Public Types
-
using TimePoint = std::chrono::time_point<std::chrono::steady_clock>#
Public Members
-
TimingMetrics timingMetrics#
-
KvCacheMetrics kvCacheMetrics#
-
std::optional<IterationType> firstIter#
First iteration where the request was processed.
-
std::optional<IterationType> lastIter#
Last iteration where a token was generated.
-
std::optional<IterationType> iter#
Current iteration.
-
struct KvCacheMetrics#
Public Members
-
SizeType32 numTotalAllocatedBlocks = {0}#
Number of total allocated blocks.
-
SizeType32 numNewAllocatedBlocks = {0}#
Number of newly allocated blocks.
-
SizeType32 numReusedBlocks = {0}#
Number of reused blocks.
-
SizeType32 numMissedBlocks = {0}#
Number of missed blocks.
-
SizeType32 kvCacheHitRate = {0}#
KV Cache Hit Rate, defined as reusedBlocks / (reusedBlocks + missedBlocks)
-
SizeType32 numTotalAllocatedBlocks = {0}#
-
struct TimingMetrics#
Public Members
-
using TimePoint = std::chrono::time_point<std::chrono::steady_clock>#
-
struct RequestStats#
- #include <types.h>
Struct that holds the stats of a single request.
Public Members
-
RequestStage stage#
The current stage the request is in.
-
SizeType32 contextPrefillPosition#
If using chunked context, the current context prefill position.
-
SizeType32 numGeneratedTokens#
The number of generated tokens so far.
-
float avgNumDecodedTokensPerIter#
The average number of decoded tokens per iteration. It is >= 1 for speculative decoding.
-
bool scheduled#
Whether the request is scheduled for the current iteration.
-
bool paused#
Whether the request is being paused at the current iteration due to lack of resources (KV cache blocks exhaustion for example)
-
std::optional<DisServingRequestStats> disServingStats#
Stats specific to disaggregated serving.
-
SizeType32 allocTotalBlocksPerRequest#
Number of total allocated blocks per request.
-
SizeType32 allocNewBlocksPerRequest#
Number of newly allocated blocks per request.
-
SizeType32 reusedBlocksPerRequest#
Number of reused blocks per request.
-
SizeType32 missedBlocksPerRequest#
Number of missed blocks per request.
-
SizeType32 kvCacheHitRatePerRequest#
KV Cache Hit Rate per request, defined as reusedBlocks / (reusedBlocks + missedBlocks)
-
RequestStage stage#
-
struct RequestStatsPerIteration#
- #include <types.h>
Struct that holds the stats of all requests in an iteration.
Public Members
-
IterationType iter#
The iteration id for these stats.
-
std::vector<RequestStats> requestStats#
The stats of all active requests for this iteration.
-
IterationType iter#
-
struct StaticBatchingStats#
- #include <types.h>
Struct that holds the stats of static batching models for a single iteration.
Public Members
-
SizeType32 numScheduledRequests#
Number of scheduled requests.
-
SizeType32 numContextRequests#
Number of requests in context stage.
-
SizeType32 numCtxTokens#
Total number of context tokens in the iteration.
-
SizeType32 numGenTokens#
Total number of tokens to generate in the iteration.
-
SizeType32 emptyGenSlots#
Total number of unused generation token slots.
-
SizeType32 numScheduledRequests#
-
template<typename T, bool = false>
struct TypeTraits# - #include <types.h>
For converting a C++ data type to a TrtLmmDataType.
-
template<>
struct TypeTraits<bool>#
-
template<>
struct TypeTraits<float>#
-
template<>
struct TypeTraits<half>#
-
template<>
struct TypeTraits<std::int32_t>#
-
template<>
struct TypeTraits<std::int64_t>#
-
template<>
struct TypeTraits<std::int8_t>#
-
template<>
struct TypeTraits<std::uint8_t>#
-
using SizeType32 = std::int32_t#
-
namespace runtime#
-
namespace executor
serialization.h#
-
namespace tensorrt_llm
-
namespace executor
-
class Serialization#
Public Static Functions
- static RequestPerfMetrics::TimePoint deserializeTimePoint(
- std::istream &is,
- static void serialize(
- RequestPerfMetrics::TimePoint const &tp,
- std::ostream &os,
-
static size_t serializedSize(RequestPerfMetrics::TimePoint const&)#
- static RequestPerfMetrics deserializeRequestPerfMetrics(
- std::istream &is,
- static void serialize(
- RequestPerfMetrics const &metrics,
- std::ostream &os,
-
static size_t serializedSize(RequestPerfMetrics const &metrics)#
-
static SamplingConfig deserializeSamplingConfig(std::istream &is)#
-
static void serialize(SamplingConfig const &config, std::ostream &os)#
-
static size_t serializedSize(SamplingConfig const &config)#
-
static OutputConfig deserializeOutputConfig(std::istream &is)#
-
static void serialize(OutputConfig const &config, std::ostream &os)#
-
static size_t serializedSize(OutputConfig const &config)#
- static OutputConfig::AdditionalModelOutput deserializeAdditionalModelOutput(
- std::istream &is,
- static void serialize(
- OutputConfig::AdditionalModelOutput const &additionalModelOutput,
- std::ostream &os,
- static size_t serializedSize(
- OutputConfig::AdditionalModelOutput const &additionalModelOutput,
- static ExternalDraftTokensConfig deserializeExternalDraftTokensConfig(
- std::istream &is,
- static void serialize(
- ExternalDraftTokensConfig const &config,
- std::ostream &os,
-
static size_t serializedSize(ExternalDraftTokensConfig const &config)#
- static PromptTuningConfig deserializePromptTuningConfig(
- std::istream &is,
- static void serialize(
- PromptTuningConfig const &config,
- std::ostream &os,
-
static size_t serializedSize(PromptTuningConfig const &config)#
-
static MropeConfig deserializeMropeConfig(std::istream &is)#
-
static void serialize(MropeConfig const &config, std::ostream &os)#
-
static size_t serializedSize(MropeConfig const &config)#
-
static LoraConfig deserializeLoraConfig(std::istream &is)#
-
static void serialize(LoraConfig const &config, std::ostream &os)#
-
static size_t serializedSize(LoraConfig const &config)#
- static DataTransceiverState deserializeDataTransceiverState(
- std::istream &is,
- static void serialize(
- DataTransceiverState const &dataTransceiverState,
- std::ostream &os,
- static size_t serializedSize(
- DataTransceiverState const &dataTransceiverState,
- static ContextPhaseParams deserializeContextPhaseParams(
- std::istream &is,
- static void serialize(
- ContextPhaseParams const &contextPhaseParams,
- std::ostream &os,
- static size_t serializedSize(
- ContextPhaseParams const &contextPhaseParams,
- static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo(
- std::istream &is,
- static void serialize(
- SpeculativeDecodingFastLogitsInfo const &info,
- std::ostream &os,
- static size_t serializedSize(
- SpeculativeDecodingFastLogitsInfo const &info,
-
static AdditionalOutput deserializeAdditionalOutput(std::istream &is)#
- static void serialize(
- AdditionalOutput const &additionalOutput,
- std::ostream &os,
- static size_t serializedSize(
- AdditionalOutput const &additionalOutput,
-
static KvCacheConfig deserializeKvCacheConfig(std::istream &is)#
- static void serialize(
- KvCacheConfig const &kvCacheConfig,
- std::ostream &os,
-
static size_t serializedSize(KvCacheConfig const &kvCacheConfig)#
- static DynamicBatchConfig deserializeDynamicBatchConfig(
- std::istream &is,
- static void serialize(
- DynamicBatchConfig const &dynamicBatchConfig,
- std::ostream &os,
- static size_t serializedSize(
- DynamicBatchConfig const &dynamicBatchConfig,
-
static SchedulerConfig deserializeSchedulerConfig(std::istream &is)#
- static void serialize(
- SchedulerConfig const &schedulerConfig,
- std::ostream &os,
-
static size_t serializedSize(SchedulerConfig const &schedulerConfig)#
- static ExtendedRuntimePerfKnobConfig deserializeExtendedRuntimePerfKnobConfig(
- std::istream &is,
- static void serialize(
- ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig,
- std::ostream &os,
- static size_t serializedSize(
- ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig,
-
static ParallelConfig deserializeParallelConfig(std::istream &is)#
- static void serialize(
- ParallelConfig const &parallelConfig,
- std::ostream &os,
-
static size_t serializedSize(ParallelConfig const &parallelConfig)#
-
static PeftCacheConfig deserializePeftCacheConfig(std::istream &is)#
- static void serialize(
- PeftCacheConfig const &peftCacheConfig,
- std::ostream &os,
-
static size_t serializedSize(PeftCacheConfig const &peftCacheConfig)#
- static OrchestratorConfig deserializeOrchestratorConfig(
- std::istream &is,
- static void serialize(
- OrchestratorConfig const &orchestratorConfig,
- std::ostream &os,
- static size_t serializedSize(
- OrchestratorConfig const &orchestratorConfig,
-
static DecodingMode deserializeDecodingMode(std::istream &is)#
- static void serialize(
- DecodingMode const &decodingMode,
- std::ostream &os,
-
static size_t serializedSize(DecodingMode const &decodingMode)#
- static LookaheadDecodingConfig deserializeLookaheadDecodingConfig(
- std::istream &is,
- static void serialize(
- LookaheadDecodingConfig const &lookaheadDecodingConfig,
- std::ostream &os,
- static size_t serializedSize(
- LookaheadDecodingConfig const &lookaheadDecodingConfig,
-
static EagleConfig deserializeEagleConfig(std::istream &is)#
- static void serialize(
- EagleConfig const &eagleConfig,
- std::ostream &os,
-
static size_t serializedSize(EagleConfig const &eagleConfig)#
- static SpeculativeDecodingConfig deserializeSpeculativeDecodingConfig(
- std::istream &is,
- static void serialize(
- SpeculativeDecodingConfig const &specDecConfig,
- std::ostream &os,
- static size_t serializedSize(
- SpeculativeDecodingConfig const &specDecConfig,
- static GuidedDecodingConfig deserializeGuidedDecodingConfig(
- std::istream &is,
- static void serialize(
- GuidedDecodingConfig const &guidedDecodingConfig,
- std::ostream &os,
- static size_t serializedSize(
- GuidedDecodingConfig const &guidedDecodingConfig,
- static GuidedDecodingParams deserializeGuidedDecodingParams(
- std::istream &is,
- static void serialize(
- GuidedDecodingParams const &guidedDecodingParams,
- std::ostream &os,
- static size_t serializedSize(
- GuidedDecodingParams const &guidedDecodingParams,
- static KvCacheRetentionConfig deserializeKvCacheRetentionConfig(
- std::istream &is,
- static void serialize(
- KvCacheRetentionConfig const &kvCacheRetentionConfig,
- std::ostream &os,
- static size_t serializedSize(
- KvCacheRetentionConfig const &kvCacheRetentionConfig,
- static KvCacheRetentionConfig::TokenRangeRetentionConfig deserializeTokenRangeRetentionConfig(
- std::istream &is,
- static void serialize(
- KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig,
- std::ostream &os,
- static size_t serializedSize(
- KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig,
-
static DecodingConfig deserializeDecodingConfig(std::istream &is)#
- static void serialize(
- DecodingConfig const &decodingConfig,
- std::ostream &os,
-
static size_t serializedSize(DecodingConfig const &decodingConfig)#
-
static DebugConfig deserializeDebugConfig(std::istream &is)#
- static void serialize(
- DebugConfig const &debugConfig,
- std::ostream &os,
-
static size_t serializedSize(DebugConfig const &debugConfig)#
-
static ExecutorConfig deserializeExecutorConfig(std::istream &is)#
- static void serialize(
- ExecutorConfig const &executorConfig,
- std::ostream &os,
-
static size_t serializedSize(ExecutorConfig const &executorConfig)#
-
static KvCacheStats deserializeKvCacheStats(std::istream &is)#
- static void serialize(
- KvCacheStats const &kvCacheStats,
- std::ostream &os,
-
static size_t serializedSize(KvCacheStats const &kvCacheStats)#
- static StaticBatchingStats deserializeStaticBatchingStats(
- std::istream &is,
- static void serialize(
- StaticBatchingStats const &staticBatchingStats,
- std::ostream &os,
- static size_t serializedSize(
- StaticBatchingStats const &staticBatchingStats,
- static InflightBatchingStats deserializeInflightBatchingStats(
- std::istream &is,
- static void serialize(
- InflightBatchingStats const &inflightBatchingStats,
- std::ostream &os,
- static size_t serializedSize(
- InflightBatchingStats const &inflightBatchingStats,
- static IterationStats deserializeIterationStats(
- std::vector<char> &buffer,
-
static IterationStats deserializeIterationStats(std::istream &is)#
- static void serialize(
- IterationStats const &iterStats,
- std::ostream &os,
-
static std::vector<char> serialize(IterationStats const &iterStats)#
-
static size_t serializedSize(IterationStats const &iterStats)#
- static std::vector<char> serialize(
- std::vector<IterationStats> const &iterStatsVec,
- static std::vector<IterationStats> deserializeIterationStatsVec(
- std::vector<char> &buffer,
- static DisServingRequestStats deserializeDisServingRequestStats(
- std::istream &is,
- static void serialize(
- DisServingRequestStats const &stats,
- std::ostream &os,
- static size_t serializedSize(
- DisServingRequestStats const &disServingRequestStats,
-
static RequestStage deserializeRequestStage(std::istream &is)#
- static void serialize(
- RequestStage const &requestStage,
- std::ostream &os,
-
static size_t serializedSize(RequestStage const &requestStage)#
-
static RequestStats deserializeRequestStats(std::istream &is)#
-
static void serialize(RequestStats const &state, std::ostream &os)#
-
static size_t serializedSize(RequestStats const &state)#
- static RequestStatsPerIteration deserializeRequestStatsPerIteration(
- std::istream &is,
- static RequestStatsPerIteration deserializeRequestStatsPerIteration(
- std::vector<char> &buffer,
- static void serialize(
- RequestStatsPerIteration const &state,
- std::ostream &os,
- static std::vector<char> serialize(
- RequestStatsPerIteration const &state,
-
static size_t serializedSize(RequestStatsPerIteration const &state)#
- static std::vector<char> serialize(
- std::vector<RequestStatsPerIteration> const &requestStatsVec,
- static std::vector<RequestStatsPerIteration> deserializeRequestStatsPerIterationVec(
- std::vector<char> &buffer,
-
static std::string deserializeString(std::istream &is)#
-
static bool deserializeBool(std::istream &is)#
-
namespace kv_cache#
-
class Serialization#
-
namespace executor
tensor.h#
-
namespace tensorrt_llm
-
namespace executor
-
class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>#
Public Types
-
using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>#
-
using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>#
-
class Tensor#
Public Types
-
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>#
Public Functions
-
Tensor copyToCpu(Tensor::CudaStreamPtr stream = nullptr) const#
-
Tensor copyToPinned(Tensor::CudaStreamPtr stream = nullptr) const#
-
Tensor copyToPooledPinned(Tensor::CudaStreamPtr stream = nullptr) const#
-
Tensor copyToManaged(Tensor::CudaStreamPtr stream = nullptr) const#
-
Tensor copyToGpu(Tensor::CudaStreamPtr stream) const#
-
Tensor() noexcept = default#
-
~Tensor() = default#
-
void *getData()#
Returns a pointer to underlying array.
-
void const *getData() const#
Returns a pointer to underlying array.
-
MemoryType getMemoryType() const#
Returns the memory type of the buffer.
-
std::size_t getSize() const#
Returns the number of elements in the tensor.
-
std::size_t getSizeInBytes() const#
Returns the size of the tensor in bytes.
-
void setZero(CudaStreamPtr stream = nullptr)#
Set the entire memory to zero.
- Parameters:
stream – Must be a valid CUDA stream if the memory type is GPU.
-
void setFrom(Tensor const &other, CudaStreamPtr stream = nullptr)#
Copy the data and shape from another tensor.
- Parameters:
other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
-
inline explicit operator bool() const#
Public Static Functions
-
static Tensor cpu(DataType dataType, Shape shape = {})#
Allocate a cpu tensor with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor pinned(DataType dataType, Shape shape = {})#
Allocate a cpu tensor in pinned memory with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor pooledPinned(DataType dataType, Shape shape = {})#
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor managed(DataType dataType, Shape shape = {})#
Allocate a tensor in managed memory (UVM) with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
- static Tensor gpu(
- DataType dataType,
- CudaStreamPtr stream,
- Shape shape = {},
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
- Parameters:
shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
-
template<typename T>
static inline Tensor gpu( - CudaStreamPtr stream,
- Shape shape = {},
-
static Tensor of(DataType dataType, void *data, Shape shape)#
Wrap a data pointer into a tensor without taking ownership.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
data – The pointer to the existing data to wrap; the tensor does not take ownership of it.
-
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>#
-
class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>#
-
namespace runtime
-
namespace executor