namespace tensorrt_llm
namespace batch_manager
namespace runtime
class IStatefulGptDecoder
- #include <iStatefulGptDecoder.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::IGptDecoderBatched
Public Types
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) = 0
Set up the decoder before calling `forward()`; also calls reshapeBuffers.
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) = 0
Initialize the decoder with new batch of inputs.
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0
Run one step for all requests without blocking the host thread.
virtual void forwardSync() = 0
Wait for the last call to `forwardAsync` to complete.
inline virtual void forward(decoder::Output &output, decoder::Input const &input)
Run one step for all requests.
virtual void finalize(SamplingConfig const &samplingConfig) const = 0
Gather final beam search results for all requests.
virtual TensorPtr getIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
virtual TensorPtr getGatheredIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength] token ids after gatherTree
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
virtual TensorPtr getNewTokens(SizeType32 iter = 0) const = 0
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iteration `iter` (per beam), on gpu
virtual TensorPtr getAllNewTokens() const = 0
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
virtual TensorPtr getNbFinished() const = 0
- Returns:
[1], number of finished sequences, in pinned host memory
virtual ~IStatefulGptDecoder() = default
Protected Functions
IStatefulGptDecoder() = default
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
Public Functions
inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
inline explicit LookaheadModule() noexcept
inline void setExecutionConfig(executor::LookaheadDecodingConfig const &config)
inline executor::LookaheadDecodingConfig const getExecutionConfig() const
Private Members
executor::LookaheadDecodingConfig mExecutionConfig
inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
namespace nvinfer1
namespace tensorrt_llm
namespace runtime
inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)
Utility function to print a shape.
std::ostream &operator<<(std::ostream &output, ITensor const &tensor)
Utility function to print a tensor with its shape.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
tensorPtr – A possibly null shared ptr.
- Returns:
A pointer to T const, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
tensorPtr – A possibly null shared ptr.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to const T, possibly nullptr.
class ITensor : public virtual tensorrt_llm::runtime::IBuffer
Public Functions
~ITensor() override = default
template<SizeType32 n>
inline DimType64 getDimension() const
Returns the tensor n-th dimension. If n is negative, returns the (nbDims + n)th dimension, i.e. the dimension counted from the end. TODO: replace with constexpr parameter when moving to C++20.
virtual void reshape(Shape const &dims) = 0
Sets the tensor dimensions. The new size of the tensor will be `volume(dims)`.
inline virtual void resize(std::size_t newSize) override
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
inline void squeeze(SizeType32 dim)
Removes the given unit dimensions from this tensor.
inline void unsqueeze(SizeType32 dim)
Adds a unit dimension at the specified position.
inline bool shapeEquals(std::initializer_list<SizeType32> const &other) const
template<typename T>
inline bool shapeEquals(T const *dims, SizeType32 count) const
Public Static Functions
static inline std::int64_t volume(Shape const &dims)
Returns the volume of the dimensions. Returns -1 if `d.nbDims < 0`.
static inline std::size_t volumeNonNegative(Shape const &shape)
Returns the volume of the dimensions. Throws if `d.nbDims < 0`.
static Shape squeeze(Shape const &shape, SizeType32 dim)
Removes the given unit dimension from `shape`.
- Parameters:
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
- Returns:
A new shape without the unit dimension.
static Shape unsqueeze(Shape const &shape, SizeType32 dim)
Adds a unit dimension to `shape` at the specified position.
- Parameters:
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
- Returns:
A new shape with the added unit dimension.
Creates a sliced view on the underlying `tensor`. The view will have the same data type as `tensor`.
- Parameters:
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
- Returns:
A view on the `tensor`.
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
- Parameters:
tensor – The tensor to view.
offsetDims – The offset of the view in multiple dimensions.
size – The size of the view w.r.t. the last dimension in offsetDims.
- Throws:
Whenever offset overflows or the last dimension offset+size overflows.
- Returns:
A view of shape [size, the rest dimensions], or [size] when offsetDims specifies all dimensions.
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims, std::size_t size)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims, std::size_t size)
Returns the remaining slices at the last dimension when `size` is omitted.
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
- Parameters:
offsetDims – The offset of the point in multiple dimensions.
- Returns:
Just the block at the point, with shape of [the rest dimensions], or [1] when offsetDims specifies all dimensions.
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr at(TConstPtr &&tensor, Shape const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline ITensor::UniqueConstPtr at(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
Returns a view on the underlying `buffer` (or tensor) with the given shape.
- Parameters:
tensor – The tensor to view.
shape – The shape of the view.
- Returns:
A view on the `tensor`.
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
Returns a view on the underlying `tensor` which can be independently reshaped.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the `tensor`.
Returns a flattened view on the underlying `tensor` which can be independently reshaped.
- Parameters:
tensor – The tensor to flatten.
sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.
- Returns:
A flattened view on the `tensor`.
static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)
Wraps the given `data` in an `ITensor`. The `ITensor` will not own the underlying `data` and cannot be reshaped beyond `capacity`.
- Parameters:
data – The data to wrap.
type – The data type of the `data`.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
- Returns:
An `ITensor`.
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
static Shape makeShape(std::initializer_list<DimType64> const &dims)
A convenience function to create a tensor shape with the given dimensions.
static std::string toString(Shape const &dims)
A convenience function for converting a tensor shape to a `string`.
static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)
A convenience function to compare shapes.
template<typename T>
static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType32 count)
A convenience function to compare shapes.
Protected Functions
ITensor() = default
- friend class ITensorBindings
~ITensor() override = default
inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)
namespace runtime
class SpeculativeDecodingMode
Public Types
using UnderlyingType = std::uint8_t
Public Functions
inline constexpr bool isNone() const
inline constexpr bool isDraftTokensExternal() const
inline constexpr bool isMedusa() const
inline constexpr bool isLookaheadDecoding() const
inline constexpr bool isExplicitDraftTokens() const
inline constexpr bool isEagle() const
inline constexpr bool updatesPositionIds() const
inline constexpr bool requiresAttentionMask() const
inline constexpr bool predictsDraftTokens() const
inline constexpr bool needsKVCacheRewind() const
inline constexpr bool variableDraftLength() const
inline constexpr bool hasDraftLogits() const
inline constexpr bool needsDecoderPrologue() const
inline bool operator==(SpeculativeDecodingMode const &other) const
inline explicit constexpr SpeculativeDecodingMode(UnderlyingType state)
Public Static Functions
static inline constexpr auto None()
static inline constexpr auto DraftTokensExternal()
static inline constexpr auto Medusa()
static inline constexpr auto LookaheadDecoding()
static inline constexpr auto ExplicitDraftTokens()
static inline constexpr auto Eagle()
Private Functions
inline constexpr bool anyBitSet(UnderlyingType bits) const
inline constexpr bool allBitSet(UnderlyingType bits) const
Private Members
UnderlyingType mState = {kNone}
Private Static Attributes
static constexpr UnderlyingType kNone = {1U << 0U}
static constexpr UnderlyingType kDraftTokensExternal = {1U << 1U}
static constexpr UnderlyingType kMedusa = {1U << 2U}
static constexpr UnderlyingType kLookaheadDecoding = {1U << 3U}
static constexpr UnderlyingType kExplicitDraftTokens = {1U << 4U}
static constexpr UnderlyingType kEagle = {1U << 5U}
namespace tensorrt_llm
namespace batch_manager
namespace runtime
class EagleBuffers
Public Types
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>
using RequestVector = std::vector<LlmRequestPtr>
using SizeType32 = runtime::SizeType32
using TensorMap = runtime::StringPtrMap<runtime::ITensor>
Public Functions
EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, runtime::ITensor const &requestTypes, ITensor const &seqSlots, EagleBuffers::Inputs const &decoderBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const
Public Members
class tensorrt_llm::runtime::EagleBuffers::EngineOutputs engineOutputs
Private Functions
template<typename T>
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, SizeType32 vocabSizePadded, ITensor const &seqSlots, EagleBuffers::Inputs const &draftBuffers, runtime::EagleModule const &eagleModule, runtime::BufferManager const &manager) const
Private Members
std::size_t scanTempStorageBytes = {0}
std::size_t reduceTempStorageBytes = {0}
float mDefaultPosteriorThreshold = {0.09f}
bool mDoGreedySampling = {true}
class EngineOutputs
Public Members
class Inputs
Public Functions
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)
Public Members
TensorPtr randomDataValidation
[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
TensorPtr draftTokens
[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
TensorPtr draftPaths
[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
TensorPtr specDecodingPackedMasks
[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)
namespace runtime
class DecodingInput
- #include <decodingInput.h>
Represents the inputs to the decoder.
This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.
Public Functions
inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize, TensorConstPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)
Public Members
SizeType32 step
The index of the decoding step we are on. Only used in Python runtime.
SizeType32 maxLength
The maximum number of tokens to decode.
SizeType32 maxAttentionWindow
The maximum length of the attention window to consider while decoding.
SizeType32 sinkTokenLength
The number of tokens to use as attention sinks, as described in "Efficient Streaming Language Models with Attention Sinks" (https://arxiv.org/abs/2309.17453).
SizeType32 batchSize
The number of samples in the batch.
SizeType32 maxStopWordsLen
The maximum value in the `stopWordsLens` tensor.
SizeType32 maxBadWordsLen
The maximum value in the `badWordsLens` tensor.
TensorConstPtr logits
[batchSize, beamWidth, vocabSizePadded], on gpu. Logits are a probability distribution over the vocabulary, the output of the model.
TensorConstPtr endIds
[batchSize * beamWidth], on gpu
TensorConstPtr batchSlots
[batchSize], address map of the linear batch id to the seq slots, int32_t, pinned
TensorConstPtr finishReasons
[batchSize, beamWidth], finished states at current iteration. If true for some request, the decoding step of it is skipped, on gpu
TensorConstPtr sequenceLimitLength
[batchSize], on gpu. The maximum sequence length for each sequence in the batch.
TensorConstPtr embeddingBias
[batchSize, vocabSizePadded], on gpu
TensorConstPtr lengths
[batchSize, beamWidth], on gpu
TensorConstPtr badWordsPtrs
[batchSize][2, badWordsLength], on gpu
TensorConstPtr badWordsLens
[batchSize], on gpu
TensorConstPtr stopWordsPtrs
[batchSize][2, stopWordsLength], pinned
TensorConstPtr stopWordsLens
[batchSize], pinned
TensorConstPtr noRepeatNgramSize
[batchSize], on gpu
TensorPtr cacheIndirection
[batchSize, beamWidth, maxSeqLen] - the k/v cache index for beam search, on gpu
std::optional<MedusaInputs> medusaInputs
std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs
std::optional<LookaheadInputs> lookaheadInputs
std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs
std::optional<EagleInputs> eagleInputs
struct EagleInputs
Public Functions
inline EagleInputs(TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths, TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths, TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds, TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)
Public Members
TensorConstPtr nextDraftTokens
[batchSize, maxDecodingDraftTokens]
TensorConstPtr nextDraftLens
TensorConstPtr nextDraftPaths
[batchSize, maxDecodingTokens, maxPathLen]
TensorConstPtr lastDraftTokens
[batchSize, maxNumPaths, maxPathLen]
TensorConstPtr lastDraftLens
TensorConstPtr lastDraftPaths
[batchSize, maxDecodingTokens, maxPathLen]
TensorConstPtr acceptedTokens
[batchSize, maxPathLen]
TensorConstPtr acceptedLens
TensorConstPtr acceptedPathIds
TensorConstPtr chunkedContextNextTokens
TensorConstPtr seqSlots
inline EagleInputs(TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths, TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths, TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds, TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)
class ExplicitDraftTokensInputs
Public Members
TensorConstPtr nextDraftTokens
[batchSize, maxNumPaths, maxPathLen]
TensorConstPtr nextFlatTokens
[batchSize * maxDecodingTokens]
TensorConstPtr nextDraftIndices
[batchSize, maxNumPaths, maxPathLen]
TensorConstPtr nextDraftProbs
[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]
TensorConstPtr lastDraftTokens
[batchSize, maxNumPaths, maxPathLen]
TensorConstPtr lastDraftIndices
[batchSize, maxNumPaths, maxPathLen]
TensorConstPtr masks
[batchSize, maxDecodingTokens, maxDecodingTokens], bool
TensorConstPtr packedPositionIds
[batchSize * maxDecodingTokens]
TensorConstPtr bestPathLengths
TensorConstPtr bestPathIndices
TensorConstPtr nextGenerationLengths
TensorConstPtr lastPositionIdsBase
TensorConstPtr lastGenerationLengths
TensorConstPtr maxGenLengthDevice
TensorConstPtr seqSlots
TensorConstPtr nextDraftTokens
class ExternalDraftTokensInputs
struct LookaheadInputs
class MedusaInputs
Public Members
TensorConstPtr medusaPaths
[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu
TensorConstPtr medusaTreeIds
[batchSize, maxTokensPerStep], on gpu
std::vector<std::vector<TensorPtr>> medusaLogits
[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu
TensorConstPtr medusaTargetTokensPerStep
[batchSize], on gpu
TensorConstPtr medusaPaths
namespace tensorrt_llm
namespace runtime
class MemoryCounters
Public Functions
MemoryCounters() = default
inline SizeType32 getGpu() const
inline SizeType32 getCpu() const
inline SizeType32 getPinned() const
inline SizeType32 getUVM() const
inline SizeType32 getPinnedPool() const
template<MemoryType T>
inline void allocate(SizeType32 size)
void allocate(MemoryType memoryType, SizeType32 size)
template<MemoryType T>
inline void deallocate(SizeType32 size)
void deallocate(MemoryType memoryType, SizeType32 size)
std::string toString() const
Public Static Functions
static MemoryCounters &getInstance()
static std::string bytesToString(SizeType32 bytes, int precision = 2)
Private Members
std::atomic<SizeType32> mGpu = {}
std::atomic<SizeType32> mCpu = {}
std::atomic<SizeType32> mPinned = {}
std::atomic<SizeType32> mUVM = {}
std::atomic<SizeType32> mPinnedPool = {}
namespace tensorrt_llm
namespace runtime
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched
- #include <gptDecoderBatched.h>
GPT decoder class with support for in-flight batching.
Public Functions
GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const &speculativeDecodingMode, nvinfer1::DataType dtype)
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) override
Set up the decoder before calling `forward()`.
virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override
Setup buffers for ExplicitDraftTokens decoding.
virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) override
Setup buffers for Eagle decoding.
virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override
Setup buffers for Lookahead decoding.
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) override
Initialize the decoder with new batch of inputs.
virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs, ModelConfig const &modelConfig) override
Initialize batched decoder at seqSlots with new `requests`.
virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override
Run one step for all requests without blocking the host process and return the token for synchronization.
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent) override
Wait for the call to `forwardAsync` associated with a token to complete.
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent, decoder_batch::Output &output, decoder_batch::Input const &input) override
Call decoder forwardSync and wait for the call to `forwardAsync` associated with a token to complete.
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override
Run one step for all requests without blocking the host thread.
virtual void forwardSync() override
Wait for the last call to `forwardAsync` to complete.
inline virtual std::vector<bool> getFinished() const override
- Returns:
[batchSize], indicators of finished requests
inline virtual TensorPtr getFinishReasons() const override
- Returns:
[batchSize, beamWidth], FinishedState value, on gpu
inline virtual TensorPtr getIds(SizeType32 batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request `batchIdx`, on gpu. In case of beam search, contains the ungathered data.
inline virtual TensorPtr getIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
inline virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request `batchIdx`, on gpu.
inline virtual TensorPtr getGatheredIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu
virtual CudaEvent finalize(SizeType32 batchSlot, SamplingConfig const &samplingConfig, bool streaming) const override
Gather final beam search results for request `batchSlot`. Result will only be available after the returned event has completed.
virtual void finalize(SamplingConfig const &samplingConfig) const override
Gather final beam search results for all requests.
inline virtual TensorPtr getParentIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
inline virtual TensorPtr getCumLogProbs() const override
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
inline virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth], cumulative log probabilities (per beam), on gpu
inline virtual TensorPtr getLogProbs() const override
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
inline virtual TensorPtr getLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
inline virtual TensorPtr getAllNewTokens() const override
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
inline virtual TensorPtr getNewTokens(SizeType32 iter = 0) const override
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iteration `iter` (per beam), on gpu
inline virtual std::vector<SizeType32> getNbSteps() const override
- Returns:
[batchSize], the number of generation steps executed on each request
inline virtual TensorPtr getNbFinished() const override
- Returns:
[1], number of finished sequences, in pinned host memory
inline virtual TensorPtr getNextDraftTokens() const override
- Returns:
[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
inline virtual TensorPtr getPrevDraftTokensLengths() const override
- Returns:
[batchSize], predicted draft tokens lengths for previous step, on gpu
inline virtual TensorPtr getNextDraftTokensLengths() const override
- Returns:
[batchSize], predicted draft tokens lengths for next step, on gpu
inline virtual TensorPtr getAcceptedLengthsCumSum() const override
- Returns:
[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
inline virtual TensorPtr getAcceptedPackedPaths() const override
- Returns:
[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
inline virtual executor::DecodingMode getDecodingMode() const override
Private Types
using GptDecoderPtr = std::unique_ptr<IGptDecoder>
using DecodingInputPtr = std::unique_ptr<DecodingInput>
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>
Private Functions
CudaEvent postProcessRequest(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const
Gather final beam search results for request `batchIdx`.
void newRequest(SizeType32 batchSlot, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)
Initialize the decoder at `batchSlot` with a new `request`.
void allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype)
Allocate buffers for speculative decoding.
void setupSpeculativeDecoding(ModelConfig const &modelConfig)
Setup buffers for speculative decoding.
void setupLookahead(ModelConfig const &modelConfig)
Setup buffers for lookahead decoding.
void newRequestSpeculativeDecoding(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)
Sets up decoder internal tensors for new speculative decoding request.
void newRequestDraftTokensExternal(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)
Sets up decoder internal tensors for new request in Draft model Sps mode.
void newRequestMedusa(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up decoder internal tensors for new Medusa request.
void newRequestLookahead(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up decoder internal tensors for new Lookahead request.
void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up decoder internal tensors for new Explicit draft tokens request.
void newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const &request, ModelConfig const &modelConfig)
Sets up decoder internal tensors for new Eagle request.
void updateFinished(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent)
Updates finished state on host for all active requests.
void setExplicitDraftTokensInputs(decoder_batch::Input const &input)
Sets inputs for explicit draft tokens.
void setEagleInputs(decoder_batch::Input const &input)
Sets inputs for eagle decoding.
void forwardDispatch(decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)
Calls decoders for tokens per engine step.
void forwardDecoder(SizeType32 step, decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)
Calls decoder for whole batch.
Private Members
std::size_t const mVocabSize
std::size_t const mVocabSizePadded
CudaStreamPtr mRuntimeStream
CudaStreamPtr mDecoderStream
BufferManager mBufferManager
DecoderFinishedEventPtr mDecoderFinishEvent
GptDecoderPtr mDecoder
DecodingInputPtr mJointDecodingInput
DecodingOutputPtr mJointDecodingOutput
std::vector<SizeType32> mNbSteps
std::vector<bool> mFinished
std::vector<SizeType32> mMaxNewTokens
std::vector<SizeType32> mBeamWidths
std::vector<SizeType32> mNumDecodingEngineTokens
SizeType32 mMaxSequenceLength = {}
SizeType32 mMaxAttentionWindow = {}
SizeType32 mSinkTokenLength = {}
SizeType32 mActualBatchSize = {}
SizeType32 mMaxDecodingDecoderTokens = {}
SizeType32 mMaxDecodingEngineTokens = {}
SpeculativeDecodingMode mSpeculativeDecodingMode
executor::DecodingMode mDecodingMode = {executor::DecodingMode::Auto()}
std::shared_ptr<DecodingOutput::BeamHypotheses> mOutputBeamHypotheses = {nullptr}
DecodingOutput::TensorPtr mCumLogProbsTmp
SizeType32 mNumSMs
namespace tensorrt_llm
namespace runtime
class RawEngine
Public Types
Public Functions
inline explicit RawEngine(std::filesystem::path enginePath) noexcept
inline explicit RawEngine(void const *engineAddr, std::size_t engineSize) noexcept
inline std::filesystem::path getPath() const
inline std::optional<std::filesystem::path> getPathOpt() const
inline void setPath(std::filesystem::path enginePath)
inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt() const
inline void setManagedWeightsMap(std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap)
inline void const *getAddress() const
inline std::size_t getSize() const
namespace tensorrt_llm
namespace batch_manager
namespace kv_cache_manager
namespace kv_cache_manager
namespace runtime
class GptSession
Public Functions
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, RawEngine const &rawEngine, LoggerPtr logger = nullptr)
- Parameters:
sessionConfig – Configuration of the session,
modelConfig – Description of the model,
worldConfig – Description of the environment,
rawEngine – The compiled TensorRT engine,
logger – The optional logger.
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
BufferManager const &getBufferManager() const
BufferManager::CudaStreamPtr getRuntimeStreamPtr() const
inline ModelConfig const &getModelConfig() const
inline WorldConfig const &getWorldConfig() const
