Executor
executor.h
-
namespace tensorrt_llm
-
namespace executor
Functions
-
char const *version() noexcept
Version of TRT-LLM.
Variables
-
SizeType32 const kDefaultIterStatsMaxIterations = 1000
-
SizeType32 const kDefaultRequestStatsMaxIterations = 0
-
class SamplingConfig
- #include <executor.h>
Sampling configuration.
Public Functions
-
explicit SamplingConfig(SizeType32 beamWidth = 1, std::optional<SizeType32> const &topK = std::nullopt, std::optional<FloatType> const &topP = std::nullopt, std::optional<FloatType> const &topPMin = std::nullopt, std::optional<TokenIdType> const &topPResetIds = std::nullopt, std::optional<FloatType> const &topPDecay = std::nullopt, std::optional<RandomSeedType> const &seed = std::nullopt, std::optional<FloatType> const &temperature = std::nullopt, std::optional<SizeType32> const &minTokens = std::nullopt, std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt, std::optional<FloatType> const &repetitionPenalty = std::nullopt, std::optional<FloatType> const &presencePenalty = std::nullopt, std::optional<FloatType> const &frequencyPenalty = std::nullopt, std::optional<FloatType> const &lengthPenalty = std::nullopt, std::optional<SizeType32> const &earlyStopping = std::nullopt, std::optional<SizeType32> const &noRepeatNgramSize = std::nullopt)
Constructor for SamplingConfig. See the description of the parameters below.
-
bool operator==(SamplingConfig const &other) const
-
SizeType32 getBeamWidth() const
-
std::optional<SizeType32> getTopK() const
-
std::optional<SizeType32> getTopPResetIds() const
-
std::optional<RandomSeedType> getSeed() const
-
std::optional<RandomSeedType> getRandomSeed() const
-
std::optional<SizeType32> getMinTokens() const
-
std::optional<SizeType32> getMinLength() const
-
std::optional<SizeType32> getEarlyStopping() const
-
std::optional<SizeType32> getNoRepeatNgramSize() const
-
void setBeamWidth(SizeType32 beamWidth)
-
void setTopK(std::optional<SizeType32> const &topK)
-
void setTopPResetIds(std::optional<TokenIdType> const &topPResetIds)
-
void setSeed(std::optional<RandomSeedType> const &seed)
-
void setRandomSeed(std::optional<RandomSeedType> const &randomSeed)
-
void setMinTokens(std::optional<SizeType32> const &minTokens)
-
void setMinLength(std::optional<SizeType32> const &minLength)
-
void setEarlyStopping(std::optional<SizeType32> const &earlyStopping)
-
void setNoRepeatNgramSize(std::optional<SizeType32> const &noRepeatNgramSize)
Private Members
-
SizeType32 mBeamWidth
The beam width. Default is 1 which disables beam search.
-
std::optional<SizeType32> mTopK
Controls number of logits to sample from. Default is 0 (all logits).
-
std::optional<FloatType> mTopPMin
Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.
-
std::optional<TokenIdType> mTopPResetIds
Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.
-
std::optional<FloatType> mTopPDecay
Controls decay in the top-P algorithm. The decay value. Default is 1.f.
-
std::optional<RandomSeedType> mSeed
Controls the random seed used by the random number generator in sampling.
-
std::optional<FloatType> mTemperature
Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f.
-
std::optional<SizeType32> mMinTokens
Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.
-
std::optional<FloatType> mRepetitionPenalty
Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourage repetition, values > 1.f discourage it. Default is 1.f.
-
std::optional<FloatType> mPresencePenalty
Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
-
std::optional<FloatType> mFrequencyPenalty
Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
-
std::optional<FloatType> mLengthPenalty
Controls how to penalize longer sequences in beam search. Default is 0.f.
-
std::optional<SizeType32> mEarlyStopping
Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)
-
std::optional<SizeType32> mNoRepeatNgramSize
Controls how many repeat ngram size are acceptable. Default is 1 << 30.
Private Static Functions
-
static SizeType32 checkBeamWidth(SizeType32 beamWidth)
-
static std::optional<TokenIdType> const &checkTopPResetIds(std::optional<TokenIdType> const &topPResetIds)
-
static std::optional<FloatType> const &checkTemperature(std::optional<FloatType> const &temperature)
-
static std::optional<FloatType> const &checkRepetitionPenalty(std::optional<FloatType> const &penalty)
-
static std::optional<SizeType32> const &checkMinTokens(std::optional<SizeType32> const &minTokens)
-
static std::optional<SizeType32> const &checkNoRepeatNgramSize(std::optional<SizeType32> const &noRepeatNgramSize)
Friends
- friend class Serialization
-
explicit SamplingConfig(SizeType32 beamWidth = 1, std::optional<SizeType32> const &topK = std::nullopt, std::optional<FloatType> const &topP = std::nullopt, std::optional<FloatType> const &topPMin = std::nullopt, std::optional<TokenIdType> const &topPResetIds = std::nullopt, std::optional<FloatType> const &topPDecay = std::nullopt, std::optional<RandomSeedType> const &seed = std::nullopt, std::optional<FloatType> const &temperature = std::nullopt, std::optional<SizeType32> const &minTokens = std::nullopt, std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt, std::optional<FloatType> const &repetitionPenalty = std::nullopt, std::optional<FloatType> const &presencePenalty = std::nullopt, std::optional<FloatType> const &frequencyPenalty = std::nullopt, std::optional<FloatType> const &lengthPenalty = std::nullopt, std::optional<SizeType32> const &earlyStopping = std::nullopt, std::optional<SizeType32> const &noRepeatNgramSize = std::nullopt)
-
class OutputConfig
- #include <executor.h>
Configuration that controls the outputs of a Result.
Public Functions
-
explicit OutputConfig(bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, bool excludeInputFromOutput = false, bool returnEncoderOutput = false)
-
explicit OutputConfig(bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, bool excludeInputFromOutput = false, bool returnEncoderOutput = false)
-
class ExternalDraftTokensConfig
- #include <executor.h>
Configuration for speculative decoding with external draft tokens. Allows including draft tokens and draft logits, and specifying an acceptance threshold.
Public Functions
Private Members
Friends
- friend class Serialization
-
class PromptTuningConfig
- #include <executor.h>
Configuration for prompt tuning.
Public Functions
-
explicit PromptTuningConfig(Tensor embeddingTable, std::optional<VecTokenExtraIds> inputTokenExtraIds = std::nullopt)
-
std::optional<VecTokenExtraIds> getInputTokenExtraIds() const
Private Members
-
Tensor mEmbeddingTable
The prompt embedding table. Expected shape: [task vocab_size, hidden_size]. Data type must match model weights.
-
std::optional<VecTokenExtraIds> mInputTokenExtraIds
The input token extra ids for KV Cache reuse when p-tuning is enabled.
Friends
- friend class Serialization
-
explicit PromptTuningConfig(Tensor embeddingTable, std::optional<VecTokenExtraIds> inputTokenExtraIds = std::nullopt)
-
class LoraConfig
- #include <executor.h>
Configuration for LoRA.
Public Functions
Private Members
Friends
- friend class Serialization
-
struct LookaheadDecodingConfig
Public Functions
-
LookaheadDecodingConfig(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize)
-
inline explicit LookaheadDecodingConfig()
-
bool operator==(LookaheadDecodingConfig const &other) const
-
std::tuple<SizeType32 const, SizeType32 const, SizeType32 const> get() const
-
SizeType32 getWindowSize() const
-
SizeType32 getNgramSize() const
-
SizeType32 getVerificationSetSize() const
-
std::tuple<SizeType32, SizeType32, SizeType32, SizeType32> calculateSpeculativeResource() const
return <maxDecodingTokens, maxPathLen, maxDraftTokens, maxDraftPathLen>
-
bool isLE(LookaheadDecodingConfig const &that) const
return true when `this` can be executed on resources defined by `that`
Public Static Functions
-
static bool isLegal(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize) noexcept
return true when the parameter combination is valid.
Friends
- friend class Serialization
-
LookaheadDecodingConfig(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize)
-
class ContextPhaseParams
Public Functions
-
ContextPhaseParams(ContextPhaseParams const&)
-
ContextPhaseParams(ContextPhaseParams&&)
-
ContextPhaseParams &operator=(ContextPhaseParams const&)
-
ContextPhaseParams &operator=(ContextPhaseParams&&)
-
bool operator==(ContextPhaseParams const&) const noexcept
-
void const *getState() const noexcept
-
void *getState() noexcept
-
void *releaseState() noexcept
Private Members
Private Static Functions
-
static void deleter(void const *data)
Friends
- friend class Serialization
-
ContextPhaseParams(ContextPhaseParams const&)
-
class Request
- #include <executor.h>
A class that holds information about the request.
Public Functions
-
Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false, SamplingConfig const &samplingConfig = SamplingConfig(), OutputConfig const &outputConfig = OutputConfig(), std::optional<SizeType32> const &endId = std::nullopt, std::optional<SizeType32> const &padId = std::nullopt, std::optional<std::vector<SizeType32>> positionIds = std::nullopt, std::optional<std::list<VecTokens>> badWords = std::nullopt, std::optional<std::list<VecTokens>> stopWords = std::nullopt, std::optional<Tensor> embeddingBias = std::nullopt, std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig = std::nullopt, std::optional<PromptTuningConfig> pTuningConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt, std::optional<std::string> logitsPostProcessorName = std::nullopt, std::optional<VecTokens> encoderInputTokenIds = std::nullopt, std::optional<IdType> clientId = std::nullopt, bool returnAllGeneratedTokens = false, PriorityType priority = kDefaultPriority, RequestType type = RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, std::optional<ContextPhaseParams> contextPhaseParams = std::nullopt, std::optional<Tensor> encoderInputFeatures = std::nullopt, std::optional<SizeType32> encoderOutputLength = std::nullopt, SizeType32 numReturnSequences = 1)
The Request constructor.
- Parameters:
inputTokenIds – The input token ids
maxTokens – The maximum number of tokens to generate
streaming – Indicates if the responses should be streamed or not. Default is false.
samplingConfig – The sampling configuration
outputConfig – The output configuration
endId – The end token id
padId – The pad token id
positionIds – The input position ids
badWords – A list of bad words tokens. Each “word” can be composed of multiple tokens
stopWords – A list of stop words tokens. Each “word” can be composed of multiple tokens
embeddingBias – The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]
externalDraftTokensConfig – The speculative decoding configuration
pTuningConfig – The prompt tuning configuration
loraConfig – The LoRA configuration
logitsPostProcessorName – The logits postprocessor name. Must correspond to one of the logits postprocessor names provided to the ExecutorConfig.
encoderInputTokenIds – The encoder input token ids for encoder-decoder models, or encoder-only models
returnAllGeneratedTokens – Indicates whether to return the full beams or just the newly generated tokens after every streaming step.
priority – Sets the execution priority of this request.
encoderInputFeatures – Encoder input features for multimodal models.
encoderOutputLength – Encoder output length if encoder input and output have different lengths (due to convolution down-sampling, etc.)
type – Indicate the request type for disaggregated serving mode.
contextPhaseParams – Generated token ID from context only executor.
numReturnSequences – The number of returning sequences.
-
~Request()
-
SizeType32 getMaxTokens() const
-
SizeType32 getMaxNewTokens() const
-
bool getStreaming() const
-
SamplingConfig getSamplingConfig() const
-
OutputConfig getOutputConfig() const
-
std::optional<SizeType32> getEndId() const
-
std::optional<SizeType32> getPadId() const
-
std::optional<std::vector<SizeType32>> getPositionIds() const
-
std::optional<ExternalDraftTokensConfig> getExternalDraftTokensConfig() const
-
std::optional<PromptTuningConfig> getPromptTuningConfig() const
-
std::optional<LoraConfig> getLoraConfig() const
-
std::optional<LookaheadDecodingConfig> getLookaheadConfig() const
-
std::optional<std::string> getLogitsPostProcessorName() const
-
PriorityType getPriority() const
-
bool getReturnAllGeneratedTokens() const
-
std::optional<ContextPhaseParams> const &getContextPhaseParams() const
-
std::optional<SizeType32> getEncoderOutputLength() const
-
RequestType getRequestType() const
-
SizeType32 getNumReturnSequences() const
-
void setStreaming(bool streaming)
-
void setSamplingConfig(SamplingConfig const &config)
-
void setOutputConfig(OutputConfig const &outputConfig)
-
void setEndId(SizeType32 endId)
-
void setPadId(SizeType32 padId)
-
void setPositionIds(std::vector<SizeType32> const &positionIds)
-
void setExternalDraftTokensConfig(ExternalDraftTokensConfig const &externalDraftTokensConfig)
-
void setPromptTuningConfig(PromptTuningConfig const &pTuningConfig)
-
void setLoraConfig(LoraConfig const &loraConfig)
-
void setLookaheadConfig(LookaheadDecodingConfig const &lookaheadConfig)
-
void setLogitsPostProcessorName(std::string const &logitsPostProcessorName)
-
void setPriority(PriorityType priority)
-
void setReturnAllGeneratedTokens(bool returnAllGeneratedTokens)
-
void setRequestType(RequestType const &requestType)
-
void setContextPhaseParams(ContextPhaseParams contextPhaseParams)
-
void setEncoderOutputLength(SizeType32 encoderOutputLength)
-
void setNumReturnSequences(SizeType32 numReturnSequences)
Public Static Attributes
-
static constexpr PriorityType kDefaultPriority = 0.5
-
static constexpr auto kBatchedPostProcessorName = "batched"
This logits postprocessor name will dispatch to the batched logits postprocessor.
Private Members
-
std::unique_ptr<Impl> mImpl
Friends
- friend class Serialization
-
Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false, SamplingConfig const &samplingConfig = SamplingConfig(), OutputConfig const &outputConfig = OutputConfig(), std::optional<SizeType32> const &endId = std::nullopt, std::optional<SizeType32> const &padId = std::nullopt, std::optional<std::vector<SizeType32>> positionIds = std::nullopt, std::optional<std::list<VecTokens>> badWords = std::nullopt, std::optional<std::list<VecTokens>> stopWords = std::nullopt, std::optional<Tensor> embeddingBias = std::nullopt, std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig = std::nullopt, std::optional<PromptTuningConfig> pTuningConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt, std::optional<std::string> logitsPostProcessorName = std::nullopt, std::optional<VecTokens> encoderInputTokenIds = std::nullopt, std::optional<IdType> clientId = std::nullopt, bool returnAllGeneratedTokens = false, PriorityType priority = kDefaultPriority, RequestType type = RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, std::optional<ContextPhaseParams> contextPhaseParams = std::nullopt, std::optional<Tensor> encoderInputFeatures = std::nullopt, std::optional<SizeType32> encoderOutputLength = std::nullopt, SizeType32 numReturnSequences = 1)
-
struct Result
- #include <executor.h>
Struct that holds the generation result.
Public Members
-
bool isFinal
Indicates if this is the final result for the request.
-
BeamTokens outputTokenIds
The output tokens for each beam.
-
std::optional<VecLogProbs> cumLogProbs
The cumulative log probabilities. Size beamSize.
-
std::optional<std::vector<VecLogProbs>> logProbs
The log probabilities for each generated token. Size [beamSize, outputLen].
-
std::optional<Tensor> generationLogits
The generation logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming) or [maxNewTokens, beamSize, vocabSizePadded] (streaming and allGeneratedTokens) or [1, beamSize, vocabSizePadded] (streaming and non-allGeneratedTokens)
-
std::vector<FinishReason> finishReasons
The reason why the model stopped generating tokens for each beam in this request. Size [beamSize]. Currently only supported when beamSize is 1 and when using BatchingType::kINFLIGHT.
-
std::optional<ContextPhaseParams> contextPhaseParams
The params of the context phase.
-
SizeType32 decodingIter = {0}
The decoding iterations it takes.
-
SizeType32 sequenceIndex = {0}
The index of the output sequence where 0 <= sequenceIndex < numReturnSequences.
-
bool isSequenceFinal
Indicates if this is the final result for a given sequence in the request.
-
bool isFinal
-
class Response
- #include <executor.h>
Class that holds either an error or a result.
Public Functions
-
~Response()
-
bool hasError() const
Indicates if this response has an error or not.
-
std::string const &getErrorMsg() const
Get the error message for this response. Will throw an exception if hasError is false.
Private Members
-
std::unique_ptr<Impl> mImpl
Friends
- friend class Serialization
-
~Response()
-
class SchedulerConfig
- #include <executor.h>
Configuration class for the scheduler.
Public Functions
-
explicit SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt)
-
bool operator==(SchedulerConfig const &other) const
-
CapacitySchedulerPolicy getCapacitySchedulerPolicy() const
-
std::optional<ContextChunkingPolicy> getContextChunkingPolicy() const
Private Members
-
CapacitySchedulerPolicy mCapacitySchedulerPolicy
The capacity scheduler policy. See CapacitySchedulerPolicy.
-
std::optional<ContextChunkingPolicy> mContextChunkingPolicy
The context chunking policy. See ContextChunkingPolicy.
Friends
- friend class Serialization
-
explicit SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt)
-
class KvCacheConfig
- #include <executor.h>
Configuration class for the KV cache.
Public Functions
-
explicit KvCacheConfig(bool enableBlockReuse = false, std::optional<SizeType32> const &maxTokens = std::nullopt, std::optional<std::vector<SizeType32>> const &maxAttentionWindowVec = std::nullopt, std::optional<SizeType32> const &sinkTokenLength = std::nullopt, std::optional<FloatType> const &freeGpuMemoryFraction = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt, bool onboardBlocks = true)
-
bool getEnableBlockReuse() const
-
std::optional<SizeType32> getMaxTokens() const
-
std::optional<std::vector<SizeType32>> getMaxAttentionWindowVec() const
-
std::optional<SizeType32> getSinkTokenLength() const
-
std::optional<size_t> getHostCacheSize() const
-
bool getOnboardBlocks() const
-
void setEnableBlockReuse(bool enableBlockReuse)
-
void setMaxTokens(SizeType32 maxTokens)
-
void setMaxAttentionWindowVec(std::vector<SizeType32> maxAttentionWindowVec)
-
void setSinkTokenLength(SizeType32 sinkTokenLength)
-
void setHostCacheSize(size_t hostCacheSize)
-
void setOnboardBlocks(bool onboardBlocks)
Private Members
-
bool mEnableBlockReuse
Controls if KV cache blocks can be reused for different requests.
-
std::optional<SizeType32> mMaxTokens
The maximum number of tokens that should be stored in the KV cache. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.
-
std::optional<std::vector<SizeType32>> mMaxAttentionWindowVec
Size of the attention window for each sequence. Only the last mMaxAttentionWindow tokens of each sequence will be stored in the KV cache. Different layers may have different max attention window sizes. If the number of elements in mMaxAttentionWindowVec is less than the number of layers, mMaxAttentionWindowVec will be repeated multiple times to the number of layers.
-
std::optional<SizeType32> mSinkTokenLength
Number of sink tokens (tokens to always keep in attention window)
-
std::optional<FloatType> mFreeGpuMemoryFraction
The fraction of GPU memory that should be allocated for the KV cache. Default is 90%. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.
-
std::optional<size_t> mHostCacheSize
Size of secondary memory pool in bytes. Default is 0. Having a secondary memory pool increases KV cache block reuse potential.
-
bool mOnboardBlocks
Controls whether offloaded blocks should be onboarded back into primary memory before being reused.
Friends
- friend class Serialization
-
explicit KvCacheConfig(bool enableBlockReuse = false, std::optional<SizeType32> const &maxTokens = std::nullopt, std::optional<std::vector<SizeType32>> const &maxAttentionWindowVec = std::nullopt, std::optional<SizeType32> const &sinkTokenLength = std::nullopt, std::optional<FloatType> const &freeGpuMemoryFraction = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt, bool onboardBlocks = true)
-
class ExtendedRuntimePerfKnobConfig
- #include <executor.h>
Configuration class for the runtime perf knobs.
Public Functions
-
explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false)
-
inline bool operator==(ExtendedRuntimePerfKnobConfig const &other) const
-
bool getMultiBlockMode() const
-
bool getEnableContextFMHAFP32Acc() const
-
void setMultiBlockMode(bool multiBlockMode)
-
void setEnableContextFMHAFP32Acc(bool enableContextFMHAFP32Acc)
Private Members
-
bool mMultiBlockMode
Controls whether multi-block mode should be enabled.
-
bool mEnableContextFMHAFP32Acc
Controls whether FP32 accumulation is enabled in the context FMHA runner.
Friends
- friend class Serialization
-
explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false)
-
class DebugConfig
- #include <executor.h>
Configuration class for debugging output.
Public Functions
-
explicit DebugConfig(bool dumpInputTensors = false, bool dumpOuputTensors = false, StringVec debugTensorNames = {})
-
bool operator==(DebugConfig const &other) const
-
bool getDumpInputTensors() const
-
bool getDumpOutputTensors() const
-
void setDumpInputTensors(bool dumpInputTensors)
-
void setDumpOuputTensors(bool dumpOuputTensors)
Private Types
-
using StringVec = std::vector<std::string>
Private Members
-
bool mDumpInputTensors
If true, dump all input tensors.
-
bool mDumpOuputTensors
If true, dump all output tensors.
Friends
- friend class Serialization
-
explicit DebugConfig(bool dumpInputTensors = false, bool dumpOuputTensors = false, StringVec debugTensorNames = {})
-
class OrchestratorConfig
-
class ParallelConfig
- #include <executor.h>
A configuration class for the parallel execution parameters. Currently only supports commType = CommunicationType::kMPI.
Public Functions
-
explicit ParallelConfig(CommunicationType commType = CommunicationType::kMPI, CommunicationMode commMode = CommunicationMode::kLEADER, std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, std::optional<std::vector<SizeType32>> participantIds = std::nullopt, std::optional<OrchestratorConfig> const &orchestratorConfig = std::nullopt)
Constructor.
- Parameters:
commType – The communication type. See CommunicationType.
commMode – The communication mode. See CommunicationMode.
deviceIds – The IDs of the GPUs involved in the execution of the model
participantIds – The participant IDs (MPI ranks if commType == kMPI) involved in the execution of the model. The first participant is considered to be the leader.
-
CommunicationType getCommunicationType() const
-
CommunicationMode getCommunicationMode() const
-
std::optional<std::vector<SizeType32>> getDeviceIds() const
-
std::optional<std::vector<SizeType32>> getParticipantIds() const
-
std::optional<OrchestratorConfig> getOrchestratorConfig() const
-
void setCommunicationType(CommunicationType type)
-
void setCommunicationMode(CommunicationMode mode)
-
void setDeviceIds(std::vector<SizeType32> const &deviceIds)
-
void setParticipantIds(std::vector<SizeType32> const &participantIds)
-
void setOrchestratorConfig(OrchestratorConfig const &orchestratorConfig)
Private Members
-
CommunicationType mCommType
The type of communication protocol used. Default is MPI.
-
CommunicationMode mCommMode
The mode of communication. See CommunicationMode.
-
std::optional<std::vector<SizeType32>> mDeviceIds
The GPU device ids to use for executing this model.
-
std::optional<std::vector<SizeType32>> mParticipantIds
The participant ids (MPI ranks for example) used for executing this model.
-
std::optional<OrchestratorConfig> mOrchestratorConfig
Optional orchestrator configuration.
Friends
- friend class Serialization
-
explicit ParallelConfig(CommunicationType commType = CommunicationType::kMPI, CommunicationMode commMode = CommunicationMode::kLEADER, std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, std::optional<std::vector<SizeType32>> participantIds = std::nullopt, std::optional<OrchestratorConfig> const &orchestratorConfig = std::nullopt)
-
class PeftCacheConfig
- #include <executor.h>
Configuration for the PeftCacheManager.
Public Functions
-
explicit PeftCacheConfig(SizeType32 numHostModuleLayer = 0, SizeType32 numDeviceModuleLayer = 0, SizeType32 optimalAdapterSize = 8, SizeType32 maxAdapterSize = 64, SizeType32 numPutWorkers = 1, SizeType32 numEnsureWorkers = 1, SizeType32 numCopyStreams = 1, SizeType32 maxPagesPerBlockHost = 24, SizeType32 maxPagesPerBlockDevice = 8, std::optional<float> const &deviceCachePercent = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt)
-
bool operator==(PeftCacheConfig const &other) const
-
SizeType32 getNumHostModuleLayer() const
-
SizeType32 getNumDeviceModuleLayer() const
-
SizeType32 getOptimalAdapterSize() const
-
SizeType32 getMaxAdapterSize() const
-
SizeType32 getNumPutWorkers() const
-
SizeType32 getNumEnsureWorkers() const
-
SizeType32 getNumCopyStreams() const
-
SizeType32 getMaxPagesPerBlockHost() const
-
SizeType32 getMaxPagesPerBlockDevice() const
-
std::optional<float> getDeviceCachePercent() const
-
std::optional<size_t> getHostCacheSize() const
Private Members
-
SizeType32 mNumHostModuleLayer
-
SizeType32 mNumDeviceModuleLayer
-
SizeType32 mOptimalAdapterSize
-
SizeType32 mMaxAdapterSize
-
SizeType32 mNumPutWorkers
-
SizeType32 mNumEnsureWorkers
-
SizeType32 mNumCopyStreams
-
SizeType32 mMaxPagesPerBlockHost
-
SizeType32 mMaxPagesPerBlockDevice
-
std::optional<size_t> mHostCacheSize
Friends
- friend class Serialization
-
explicit PeftCacheConfig(SizeType32 numHostModuleLayer = 0, SizeType32 numDeviceModuleLayer = 0, SizeType32 optimalAdapterSize = 8, SizeType32 maxAdapterSize = 64, SizeType32 numPutWorkers = 1, SizeType32 numEnsureWorkers = 1, SizeType32 numCopyStreams = 1, SizeType32 maxPagesPerBlockHost = 24, SizeType32 maxPagesPerBlockDevice = 8, std::optional<float> const &deviceCachePercent = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt)
-
class DecodingConfig
- #include <executor.h>
Configuration class for the decoding.
Public Functions
-
explicit DecodingConfig(std::optional<DecodingMode> decodingMode = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadDecodingConfig = std::nullopt, std::optional<MedusaChoices> medusaChoices = std::nullopt)
-
bool operator==(DecodingConfig const &other) const
-
void setDecodingMode(DecodingMode const&)
Sets decoding mode. Some modes require the use of their own setters.
-
std::optional<DecodingMode> getDecodingMode() const
-
void setLookaheadDecoding(LookaheadDecodingConfig const &lookaheadDecodingConfig)
Sets lookahead decoding mode and config.
-
std::optional<LookaheadDecodingConfig> getLookaheadDecodingConfig() const
-
void setMedusaChoices(MedusaChoices const&)
Sets medusa mode and config.
-
std::optional<MedusaChoices> getMedusaChoices() const
Private Members
-
std::optional<DecodingMode> mDecodingMode
-
std::optional<LookaheadDecodingConfig> mLookaheadDecodingConfig
-
std::optional<MedusaChoices> mMedusaChoices
Friends
- friend class Serialization
-
explicit DecodingConfig(std::optional<DecodingMode> decodingMode = std::nullopt, std::optional<LookaheadDecodingConfig> lookaheadDecodingConfig = std::nullopt, std::optional<MedusaChoices> medusaChoices = std::nullopt)
-
class LogitsPostProcessorConfig
Public Functions
-
explicit LogitsPostProcessorConfig(std::optional<LogitsPostProcessorMap> processorMap = std::nullopt, std::optional<LogitsPostProcessorBatched> processorBatched = std::nullopt, bool replicate = true)
-
std::optional<LogitsPostProcessorMap> getProcessorMap() const
-
std::optional<LogitsPostProcessorBatched> getProcessorBatched() const
-
bool getReplicate() const
-
void setProcessorMap(LogitsPostProcessorMap const &processorMap)
-
void setProcessorBatched(LogitsPostProcessorBatched const &processorBatched)
-
void setReplicate(bool replicate)
Private Members
-
std::optional<LogitsPostProcessorMap> mProcessorMap
mapping from post processor names to non-batched post processors
-
std::optional<LogitsPostProcessorBatched> mProcessorBatched
single batched post processor
-
bool mReplicate
If set to true, logits post processor will run on all TP ranks in last PP rank.
-
explicit LogitsPostProcessorConfig(std::optional<LogitsPostProcessorMap> processorMap = std::nullopt, std::optional<LogitsPostProcessorBatched> processorBatched = std::nullopt, bool replicate = true)
-
class ExecutorConfig
- #include <executor.h>
Configuration class for the model executor.
Public Functions
-
explicit ExecutorConfig(SizeType32 maxBeamWidth = 1, SchedulerConfig const &schedulerConfig = SchedulerConfig(), KvCacheConfig const &kvCacheConfig = KvCacheConfig(), bool enableChunkedContext = false, bool normalizeLogProbs = true, SizeType32 iterStatsMaxIterations = kDefaultIterStatsMaxIterations, SizeType32 requestStatsMaxIterations = kDefaultRequestStatsMaxIterations, BatchingType batchingType = BatchingType::kINFLIGHT, std::optional<SizeType32> maxBatchSize = std::nullopt, std::optional<SizeType32> maxNumTokens = std::nullopt, std::optional<ParallelConfig> parallelConfig = std::nullopt, std::optional<PeftCacheConfig> const &peftCacheConfig = std::nullopt, std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig = std::nullopt, std::optional<DecodingConfig> decodingConfig = std::nullopt, float gpuWeightsPercent = 1, std::optional<SizeType32> maxQueueSize = std::nullopt, ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(), std::optional<DebugConfig> debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0, uint64_t maxSeqIdleMicroseconds = 180000000)
-
SizeType32 getMaxBeamWidth() const
-
SchedulerConfig getSchedulerConfig() const
-
KvCacheConfig getKvCacheConfig() const
-
bool getEnableChunkedContext() const
-
bool getNormalizeLogProbs() const
-
SizeType32 getIterStatsMaxIterations() const
-
SizeType32 getRequestStatsMaxIterations() const
-
BatchingType getBatchingType() const
-
std::optional<SizeType32> getMaxBatchSize() const
-
std::optional<SizeType32> getMaxNumTokens() const
-
std::optional<ParallelConfig> getParallelConfig() const
-
std::optional<PeftCacheConfig> getPeftCacheConfig() const
-
std::optional<LogitsPostProcessorConfig> getLogitsPostProcessorConfig() const
-
std::optional<DecodingConfig> getDecodingConfig() const
-
float getGpuWeightsPercent() const
-
std::optional<SizeType32> getMaxQueueSize() const
-
ExtendedRuntimePerfKnobConfig getExtendedRuntimePerfKnobConfig() const
-
std::optional<DebugConfig> getDebugConfig() const
-
SizeType32 getRecvPollPeriodMs() const
-
uint64_t getMaxSeqIdleMicroseconds() const
-
void setMaxBeamWidth(SizeType32 maxBeamWidth)
-
void setMaxBatchSize(SizeType32 maxBatchSize)
-
void setMaxNumTokens(SizeType32 maxNumTokens)
-
void setSchedulerConfig(SchedulerConfig const &schedulerConfig)
-
void setKvCacheConfig(KvCacheConfig const &kvCacheConfig)
-
void setEnableChunkedContext(bool enableChunkedContext)
-
void setNormalizeLogProbs(bool normalizeLogProbs)
-
void setIterStatsMaxIterations(SizeType32 iterStatsMaxIterations)
-
void setRequestStatsMaxIterations(SizeType32 requestStatsMaxIterations)
-
void setBatchingType(BatchingType batchingType)
-
void setParallelConfig(ParallelConfig const &parallelConfig)
-
void setPeftCacheConfig(PeftCacheConfig const &peftCacheConfig)
-
void setLogitsPostProcessorConfig(LogitsPostProcessorConfig const &logitsPostProcessorConfig)
-
void setDecodingConfig(DecodingConfig const &decodingConfig)
-
void setGpuWeightsPercent(float const &gpuWeightsPercent)
-
void setMaxQueueSize(std::optional<SizeType32> const &maxQueueSize)
-
void setExtendedRuntimePerfKnobConfig(ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig)
-
void setDebugConfig(DebugConfig const &debugConfig)
-
void setRecvPollPeriodMs(SizeType32 const &recvPollPeriodMs)
-
void setMaxSeqIdleMicroseconds(uint64_t maxSeqIdleMicroseconds)
Private Members
-
SizeType32 mMaxBeamWidth
The beam width value of requests that will be sent to the executor.
-
SchedulerConfig mSchedulerConfig
The scheduler configuration.
-
KvCacheConfig mKvCacheConfig
The KV cache configuration.
-
bool mEnableChunkedContext
Controls whether the context phase is allowed to be chunked.
-
bool mNormalizeLogProbs
Controls if log probabilities should be normalized or not.
-
SizeType32 mIterStatsMaxIterations
Controls the maximum number of iterations for which to keep statistics.
-
SizeType32 mRequestStatsMaxIterations
Controls the maximum number of iterations for which to keep per-request statistics.
-
BatchingType mBatchingType
The type of batching strategy to use. See BatchingType.
-
std::optional<SizeType32> mMaxBatchSize
The max batch size of requests.
-
std::optional<SizeType32> mMaxNumTokens
The max number of tokens per batch.
-
std::optional<ParallelConfig> mParallelConfig
The parallel execution configuration.
-
std::optional<PeftCacheConfig> mPeftCacheConfig
-
std::optional<LogitsPostProcessorConfig> mLogitsPostProcessorConfig
Logits post processor configuration.
-
std::optional<DecodingConfig> mDecodingConfig
Decoding configuration.
-
float mGpuWeightsPercent
GPU weights percent for weight streaming.
-
std::optional<SizeType32> mMaxQueueSize
The maximum number of requests allowed in queue before rejecting new requests.
-
ExtendedRuntimePerfKnobConfig mExtendedRuntimePerfKnobConfig
Config for perf knobs that can be set in runtime.
-
std::optional<DebugConfig> mDebugConfig
Debugging configuration.
-
SizeType32 mRecvPollPeriodMs
The time in ms between polls for new communication in orchestrator mode. Use 0 for busy loop.
-
uint64_t mMaxSeqIdleMicroseconds
The maximum time in microseconds a scheduled request can remain idle before getting terminated. Default is 3 minutes.
Friends
- friend class Serialization
-
explicit ExecutorConfig(SizeType32 maxBeamWidth = 1, SchedulerConfig const &schedulerConfig = SchedulerConfig(), KvCacheConfig const &kvCacheConfig = KvCacheConfig(), bool enableChunkedContext = false, bool normalizeLogProbs = true, SizeType32 iterStatsMaxIterations = kDefaultIterStatsMaxIterations, SizeType32 requestStatsMaxIterations = kDefaultRequestStatsMaxIterations, BatchingType batchingType = BatchingType::kINFLIGHT, std::optional<SizeType32> maxBatchSize = std::nullopt, std::optional<SizeType32> maxNumTokens = std::nullopt, std::optional<ParallelConfig> parallelConfig = std::nullopt, std::optional<PeftCacheConfig> const &peftCacheConfig = std::nullopt, std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig = std::nullopt, std::optional<DecodingConfig> decodingConfig = std::nullopt, float gpuWeightsPercent = 1, std::optional<SizeType32> maxQueueSize = std::nullopt, ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(), std::optional<DebugConfig> debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0, uint64_t maxSeqIdleMicroseconds = 180000000)
-
class Executor
- #include <executor.h>
The executor is responsible for receiving new requests and sending responses, and running the inference.
Public Functions
-
Executor(std::filesystem::path const &modelPath, ModelType modelType, ExecutorConfig const &executorConfig)
- Parameters:
modelPath – Path to the folder that defines the model to run
modelType – The type of model
executorConfig – The configuration for the executor
comm – An optional inter-process communicator configuration
-
Executor(std::filesystem::path const &encoderModelPath, std::filesystem::path const &decoderModelPath, ModelType modelType, ExecutorConfig const &executorConfig)
-
Executor(BufferView const &engineBuffer, std::string const &jsonConfigStr, ModelType modelType, ExecutorConfig const &executorConfig, std::optional<std::map<std::string, Tensor>> const &managedWeights = std::nullopt)
-
Executor(BufferView const &encoderEngineBuffer, std::string const &encoderJsonConfigStr, BufferView const &decoderEngineBuffer, std::string const &decoderJsonConfigStr, ModelType modelType, ExecutorConfig const &executorConfig)
-
~Executor()
-
IdType enqueueRequest(Request const &request)
Enqueue a new request.
- Parameters:
request – The LLM request which contains input tokens and request parameters
- Returns:
A unique id that identifies the request
-
std::vector<IdType> enqueueRequests(std::vector<Request> const &requests)
Enqueue a batch of requests.
-
std::vector<Response> awaitResponses(std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)
Await ready responses.
This overload awaits any ready responses. In particular, if several requests have been enqueued, this method will provide any ready responses without order guarantees.
- Parameters:
timeout – The maximum time to wait for new responses
- Returns:
A vector of responses
-
std::vector<Response> awaitResponses(IdType const &requestId, std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)
Await ready responses for the given request id.
- Parameters:
requestId – A request id
timeout – The maximum time to wait for new responses
- Returns:
A vector of responses
-
std::vector<std::vector<Response>> awaitResponses(std::vector<IdType> const &requestIds, std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)
Await ready responses for multiple request ids.
A multiple ID request behaves as if awaitResponses(IdType, timeout) were invoked on all IDs. The returned vector contains a vector of responses per ID in the same order specified by the requestIds. The same behaviour as awaitResponses(IdType, timeout) applies: (1) responses may be empty; (2) if all responses have already been given for one of the requestIds, then this method will hang unless a timeout is specified.
- Parameters:
requestIds – Ids requested
timeout – The maximum time to wait for new responses
- Returns:
A vector of vector of responses
-
SizeType32 getNumResponsesReady(std::optional<IdType> const &requestId = std::nullopt) const
Get the number of ready responses.
- Parameters:
requestId – An optional request id
- Returns:
The number of ready responses
-
void cancelRequest(IdType requestId)
Cancel the request with provided request id.
- Parameters:
requestId – The request id for which to cancel the response
-
void shutdown()
Signals the server to shut down. This call is blocking. Only returns when all requests have terminated or timeout has been reached.
-
std::deque<IterationStats> getLatestIterationStats()
Returns the per-iteration statistics computed since the last call to getLatestIterationStats. Contains at most iterStatsMaxIterations iterations.
- Returns:
Iteration stats
-
std::deque<RequestStatsPerIteration> getLatestRequestStats()
Returns the request stats of each iteration computed since the last call to getLatestRequestStats. Contains at most requestStatsMaxIterations iterations.
- Returns:
Request stats grouped by iterations
-
bool canEnqueueRequests() const
Indicates if the current process is allowed to enqueueRequests.
Private Members
-
std::unique_ptr<Impl> mImpl
-
Executor(std::filesystem::path const &modelPath, ModelType modelType, ExecutorConfig const &executorConfig)
-
class JsonSerialization
- #include <executor.h>
Class with utility functions to serialize statistics to json string.
Public Static Functions
-
static std::string toJsonStr(IterationStats const &iterationStats)
Utility function to convert an iterationStats struct to a json serialized string.
-
static std::string toJsonStr(RequestStatsPerIteration const &requestStatsPerIter)
Utility function to convert a requestStatsPerIteration struct to a json serialized string.
-
static std::string toJsonStr(RequestStats const &requestStats)
Utility function to convert a requestStats struct to a json serialized string.
-
static std::string toJsonStr(IterationStats const &iterationStats)
-
char const *version() noexcept
-
namespace mpi
-
namespace executor
serialization.h
-
namespace tensorrt_llm
-
namespace executor
-
class Serialization
Public Static Functions
-
static SamplingConfig deserializeSamplingConfig(std::istream &is)
-
static void serialize(SamplingConfig const &config, std::ostream &os)
-
static size_t serializedSize(SamplingConfig const &config)
-
static OutputConfig deserializeOutputConfig(std::istream &is)
-
static void serialize(OutputConfig const &config, std::ostream &os)
-
static size_t serializedSize(OutputConfig const &config)
-
static ExternalDraftTokensConfig deserializeExternalDraftTokensConfig(std::istream &is)
-
static void serialize(ExternalDraftTokensConfig const &config, std::ostream &os)
-
static size_t serializedSize(ExternalDraftTokensConfig const &config)
-
static PromptTuningConfig deserializePromptTuningConfig(std::istream &is)
-
static void serialize(PromptTuningConfig const &config, std::ostream &os)
-
static size_t serializedSize(PromptTuningConfig const &config)
-
static LoraConfig deserializeLoraConfig(std::istream &is)
-
static void serialize(LoraConfig const &config, std::ostream &os)
-
static size_t serializedSize(LoraConfig const &config)
-
static ContextPhaseState deserializeContextPhaseState(std::istream &is)
-
static void serialize(ContextPhaseState const &contextPhaseState, std::ostream &os)
-
static size_t serializedSize(ContextPhaseState const &contextPhaseState)
-
static ContextPhaseParams deserializeContextPhaseParams(std::istream &is)
-
static void serialize(ContextPhaseParams const &contextPhaseParams, std::ostream &os)
-
static size_t serializedSize(ContextPhaseParams const &contextPhaseParams)
-
static KvCacheConfig deserializeKvCacheConfig(std::istream &is)
-
static void serialize(KvCacheConfig const &kvCacheConfig, std::ostream &os)
-
static size_t serializedSize(KvCacheConfig const &kvCacheConfig)
-
static SchedulerConfig deserializeSchedulerConfig(std::istream &is)
-
static void serialize(SchedulerConfig const &schedulerConfig, std::ostream &os)
-
static size_t serializedSize(SchedulerConfig const &schedulerConfig)
-
static ExtendedRuntimePerfKnobConfig deserializeExtendedRuntimePerfKnobConfig(std::istream &is)
-
static void serialize(ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig, std::ostream &os)
-
static size_t serializedSize(ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig)
-
static ParallelConfig deserializeParallelConfig(std::istream &is)
-
static void serialize(ParallelConfig const &parallelConfig, std::ostream &os)
-
static size_t serializedSize(ParallelConfig const &parallelConfig)
-
static PeftCacheConfig deserializePeftCacheConfig(std::istream &is)
-
static void serialize(PeftCacheConfig const &peftCacheConfig, std::ostream &os)
-
static size_t serializedSize(PeftCacheConfig const &peftCacheConfig)
-
static OrchestratorConfig deserializeOrchestratorConfig(std::istream &is)
-
static void serialize(OrchestratorConfig const &orchestratorConfig, std::ostream &os)
-
static size_t serializedSize(OrchestratorConfig const &orchestratorConfig)
-
static DecodingMode deserializeDecodingMode(std::istream &is)
-
static void serialize(DecodingMode const &decodingMode, std::ostream &os)
-
static size_t serializedSize(DecodingMode const &decodingMode)
-
static LookaheadDecodingConfig deserializeLookaheadDecodingConfig(std::istream &is)
-
static void serialize(LookaheadDecodingConfig const &lookaheadDecodingConfig, std::ostream &os)
-
static size_t serializedSize(LookaheadDecodingConfig const &lookaheadDecodingConfig)
-
static DecodingConfig deserializeDecodingConfig(std::istream &is)
-
static void serialize(DecodingConfig const &decodingConfig, std::ostream &os)
-
static size_t serializedSize(DecodingConfig const &decodingConfig)
-
static DebugConfig deserializeDebugConfig(std::istream &is)
-
static void serialize(DebugConfig const &debugConfig, std::ostream &os)
-
static size_t serializedSize(DebugConfig const &debugConfig)
-
static ExecutorConfig deserializeExecutorConfig(std::istream &is)
-
static void serialize(ExecutorConfig const &executorConfig, std::ostream &os)
-
static size_t serializedSize(ExecutorConfig const &executorConfig)
-
static KvCacheStats deserializeKvCacheStats(std::istream &is)
-
static void serialize(KvCacheStats const &kvCacheStats, std::ostream &os)
-
static size_t serializedSize(KvCacheStats const &kvCacheStats)
-
static StaticBatchingStats deserializeStaticBatchingStats(std::istream &is)
-
static void serialize(StaticBatchingStats const &staticBatchingStats, std::ostream &os)
-
static size_t serializedSize(StaticBatchingStats const &staticBatchingStats)
-
static InflightBatchingStats deserializeInflightBatchingStats(std::istream &is)
-
static void serialize(InflightBatchingStats const &inflightBatchingStats, std::ostream &os)
-
static size_t serializedSize(InflightBatchingStats const &inflightBatchingStats)
-
static IterationStats deserializeIterationStats(std::vector<char> &buffer)
-
static IterationStats deserializeIterationStats(std::istream &is)
-
static void serialize(IterationStats const &iterStats, std::ostream &os)
-
static std::vector<char> serialize(IterationStats const &iterStats)
-
static size_t serializedSize(IterationStats const &iterStats)
-
static std::string deserializeString(std::istream &is)
-
static bool deserializeBool(std::istream &is)
-
static SamplingConfig deserializeSamplingConfig(std::istream &is)
-
namespace kv_cache
-
class Serialization
-
namespace executor
tensor.h
-
namespace tensorrt_llm
-
namespace executor
-
class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>
Public Types
-
using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>
-
using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>
-
class Tensor
Public Types
-
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>
Public Functions
-
Tensor copyToCpu(Tensor::CudaStreamPtr stream = nullptr) const
-
Tensor copyToPinned(Tensor::CudaStreamPtr stream = nullptr) const
-
Tensor copyToPooledPinned(Tensor::CudaStreamPtr stream = nullptr) const
-
Tensor copyToManaged(Tensor::CudaStreamPtr stream = nullptr) const
-
Tensor copyToGpu(Tensor::CudaStreamPtr stream) const
-
Tensor() noexcept = default
-
~Tensor() = default
-
void *getData()
Returns a pointer to underlying array.
-
void const *getData() const
Returns a pointer to underlying array.
-
MemoryType getMemoryType() const
Returns the memory type of the buffer.
-
std::size_t getSize() const
Returns the number of elements in the tensor.
-
std::size_t getSizeInBytes() const
Returns the size of the tensor in bytes.
-
void setZero(CudaStreamPtr stream = nullptr)
Set the entire memory to zero.
- Parameters:
stream – Must be a valid CUDA stream if the memory type is GPU.
-
void setFrom(Tensor const &other, CudaStreamPtr stream = nullptr)
Copy the data and shape from another tensor.
- Parameters:
other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
-
inline explicit operator bool() const
Public Static Functions
-
static Tensor cpu(DataType dataType, Shape shape = {})
Allocate a cpu tensor with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor pinned(DataType dataType, Shape shape = {})
Allocate a cpu tensor in pinned memory with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor pooledPinned(DataType dataType, Shape shape = {})
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor managed(DataType dataType, Shape shape = {})
Allocate a tensor in managed memory (UVM) with the given shape and data type.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
-
static Tensor gpu(DataType dataType, CudaStreamPtr stream, Shape shape = {})
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
- Parameters:
shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
-
template<typename T>
static inline Tensor gpu(CudaStreamPtr stream, Shape shape = {})
-
static Tensor of(DataType dataType, void *data, Shape shape)
Wrap a data pointer into a tensor without taking ownership.
- Parameters:
shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Private Functions
-
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>
-
class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>
-
namespace runtime
-
namespace executor
types.h
-
template<>
struct TypeTraits<std::int8_t>
-
template<>
struct TypeTraits<std::int32_t>
-
template<>
struct TypeTraits<std::int64_t>
-
template<>
struct TypeTraits<std::uint8_t>
-
namespace tensorrt_llm
-
namespace executor
-
namespace executor