Runtime
iStatefulGptDecoder.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace runtime
-
class IStatefulGptDecoder
- #include <iStatefulGptDecoder.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::IGptDecoderBatched
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) = 0
Set up the decoder before calling forward(); this also calls reshapeBuffers.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) = 0
Initialize the decoder with a new batch of inputs.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() = 0
Wait for the last call to forwardAsync to complete.
-
inline virtual void forward(decoder::Output &output, decoder::Input const &input)
Run one step for all requests.
-
virtual void finalize(SamplingConfig const &samplingConfig) const = 0
Gather final beam search results for all requests.
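The calls above compose into a simple decode loop. The following is a minimal, hedged sketch, assuming a concrete IStatefulGptDecoder named decoder and caller-provided genInput/genOutput (GenerationInput/GenerationOutput), samplingConfig, modelConfig, decoderInput/decoderOutput (decoder::Input/Output) and scalar limits; their construction is not part of this interface and is not shown.

    // Minimal lifecycle sketch (assumptions listed above).
    decoder.setup(executor::DecodingMode::Auto(), maxBatchSize, maxBeamWidth,
        maxAttentionWindow, sinkTokenLength, maxSequenceLength,
        /*maxTokensPerStep=*/1, logitsDtype, modelConfig);

    decoder.newBatch(genInput, genOutput, samplingConfig, modelConfig);

    for (SizeType32 step = 0; step < maxNewTokens; ++step)
    {
        decoder.forwardAsync(decoderOutput, decoderInput); // enqueue one decoding step
        decoder.forwardSync();                             // wait for it to complete
        // getNbFinished() (pinned host memory) can be polled here for early exit.
    }

    decoder.finalize(samplingConfig); // gather final beam search results
    auto ids = decoder.getIds();      // [batchSize, beamWidth, maxSequenceLength], on gpu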
-
virtual TensorPtr getIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
-
virtual TensorPtr getGatheredIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength] token ids after gatherTree
-
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
virtual TensorPtr getNewTokens(SizeType32 iter = 0) const = 0
Get tokens generated in one step of the last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
-
virtual TensorPtr getAllNewTokens() const = 0
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
virtual TensorPtr getNbFinished() const = 0
- Returns:
[1], number of finished sequences, in pinned host memory
-
virtual ~IStatefulGptDecoder() = default
Protected Functions
-
IStatefulGptDecoder() = default
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
class IStatefulGptDecoder
-
namespace batch_manager
lookaheadModule.h
-
namespace tensorrt_llm
-
namespace runtime
-
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
Public Functions
-
inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
-
inline explicit LookaheadModule() noexcept
-
inline void setExecutionConfig(executor::LookaheadDecodingConfig const &config)
-
inline executor::LookaheadDecodingConfig const getExecutionConfig() const
Private Members
-
executor::LookaheadDecodingConfig mExecutionConfig
-
inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
-
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
-
namespace runtime
iTensor.h
-
namespace nvinfer1
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)
Utility function to print a shape.
-
std::ostream &operator<<(std::ostream &output, ITensor const &tensor)
Utility function to print a tensor with its shape.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
tensorPtr – A possibly null shared ptr.
- Returns:
A pointer to T const, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
tensorPtr – A possibly null shared ptr.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to const T, possibly nullptr.
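The overloads above all share the same null-propagating behaviour. Below is a hypothetical sketch of that behaviour only; castOrNull is an illustrative name, not a declaration from iTensor.h, and bufferCast and ITensor::SharedPtr are assumed from the runtime headers.

    #include <optional>
    #include "tensorrt_llm/runtime/iTensor.h"

    using namespace tensorrt_llm::runtime;

    // Hypothetical illustration only: returns a typed pointer, or nullptr if the
    // shared pointer (or the optional wrapping it) is empty, instead of throwing.
    template <typename T>
    T* castOrNull(ITensor::SharedPtr const& tensorPtr)
    {
        return tensorPtr ? bufferCast<T>(*tensorPtr) : nullptr;
    }

    template <typename T>
    T* castOrNull(std::optional<ITensor::SharedPtr> const& optionalTensorPtr)
    {
        return optionalTensorPtr ? castOrNull<T>(*optionalTensorPtr) : nullptr;
    }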
-
class ITensor : public virtual tensorrt_llm::runtime::IBuffer
-
Public Functions
-
~ITensor() override = default
-
template<SizeType32 n>
inline DimType64 getDimension() const
Returns the n-th dimension of the tensor. If n is negative, returns the (nbDims - n)th dimension. TODO: replace with a constexpr parameter when moving to C++20.
-
virtual void reshape(Shape const &dims) = 0
Sets the tensor dimensions. The new size of the tensor will be volume(dims).
-
inline virtual void resize(std::size_t newSize) override
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
inline void squeeze(SizeType32 dim)
Removes the given unit dimension from this tensor.
-
inline void unsqueeze(SizeType32 dim)
Adds a unit dimension at the specified position.
-
inline bool shapeEquals(std::initializer_list<SizeType32> const &other) const
-
template<typename T>
inline bool shapeEquals(T const *dims, SizeType32 count) const
Public Static Functions
-
static inline std::int64_t volume(Shape const &dims)
Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
-
static inline std::size_t volumeNonNegative(Shape const &shape)
Returns the volume of the dimensions. Throws if d.nbDims < 0.
-
static Shape squeeze(Shape const &shape, SizeType32 dim)
Removes the given unit dimension from shape.
- Parameters:
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
- Returns:
A new shape without the unit dimension.
-
static Shape unsqueeze(Shape const &shape, SizeType32 dim)
Adds a unit dimension to shape at the specified position.
- Parameters:
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
- Returns:
A new shape with the added unit dimension.
Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
- Parameters:
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
- Parameters:
tensor – The tensor to view.
offsetDims – The offset dimensions of the view.
size – The size of the view w.r.t. the last dimension in offsetDims.
- Throws:
Whenever offset overflows or the last dimension offset+size overflows.
- Returns:
A view of shape [size, the remaining dimensions], or [size] when offsetDims specifies all dimensions.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims, std::size_t size)
Returns the remaining slices at the last dimension when size is omitted.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
- Parameters:
offsetDims – The offset in multiple dimensions.
- Returns:
Just the block at that point, with shape [the remaining dimensions], or [1] when offsetDims specifies all dimensions.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr at(TConstPtr &&tensor, Shape const &offsetDims)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline ITensor::UniqueConstPtr at(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
Returns a view on the underlying buffer (or tensor) with the given shape.
- Parameters:
tensor – The tensor to view.
shape – The shape of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
Returns a view on the underlying tensor which can be independently reshaped.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
Returns a flattened view on the underlying tensor which can be independently reshaped.
- Parameters:
tensor – The tensor to flatten.
sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.
- Returns:
A flattened view on the tensor.
-
static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
- Returns:
An ITensor.
-
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
-
static Shape makeShape(std::initializer_list<DimType64> const &dims)
A convenience function to create a tensor shape with the given dimensions.
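A hedged usage sketch combining wrap, makeShape, volume, slice and view. It assumes the include path tensorrt_llm/runtime/iTensor.h and that non-const slice/view overloads mirror the const ones listed above; the host buffer and its shape are placeholders.

    #include <cassert>
    #include <vector>
    #include "tensorrt_llm/runtime/iTensor.h"

    using namespace tensorrt_llm::runtime;

    void sliceAndViewSketch()
    {
        // Host data wrapped in a non-owning ITensor of shape [2, 3, 4].
        std::vector<float> data(2 * 3 * 4);
        auto shape = ITensor::makeShape({2, 3, 4});
        ITensor::SharedPtr tensor = ITensor::wrap(data.data(), shape, data.size());

        assert(ITensor::volume(shape) == 24);

        // View the first slice along dimension 0: shape [1, 3, 4].
        auto firstSlice = ITensor::slice(tensor, 0, 1);

        // Reinterpret the same storage as [6, 4] without copying.
        auto reshaped = ITensor::view(tensor, ITensor::makeShape({6, 4}));
    }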
-
static std::string toString(Shape const &dims)
A convenience function for converting a tensor shape to a string.
-
static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)
A convenience function to compare shapes.
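A short sketch of the shape-only helpers, continuing the assumptions of the previous example; no tensor allocation is involved.

    auto shape = ITensor::makeShape({1, 8, 128});
    auto squeezed = ITensor::squeeze(shape, 0);      // [8, 128]
    auto restored = ITensor::unsqueeze(squeezed, 0); // [1, 8, 128]
    assert(ITensor::shapeEquals(shape, restored));
    std::cout << ITensor::toString(restored) << '\n'; // e.g. "(1, 8, 128)"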
-
template<typename T>
static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType32 count)
A convenience function to compare shapes.
Protected Functions
-
ITensor() = default
Friends
- friend class ITensorBindings
-
~ITensor() override = default
-
inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)
-
namespace runtime
speculativeDecodingMode.h
-
namespace tensorrt_llm
-
namespace runtime
-
class SpeculativeDecodingMode
Public Types
-
using UnderlyingType = std::uint8_t
Public Functions
-
inline constexpr bool isNone() const
-
inline constexpr bool isDraftTokensExternal() const
-
inline constexpr bool isMedusa() const
-
inline constexpr bool isLookaheadDecoding() const
-
inline constexpr bool isExplicitDraftTokens() const
-
inline constexpr bool isEagle() const
-
inline constexpr bool updatesPositionIds() const
-
inline constexpr bool requiresAttentionMask() const
-
inline constexpr bool predictsDraftTokens() const
-
inline constexpr bool needsKVCacheRewind() const
-
inline constexpr bool variableDraftLength() const
-
inline constexpr bool hasDraftLogits() const
-
inline constexpr bool needsDecoderPrologue() const
-
inline bool operator==(SpeculativeDecodingMode const &other) const
-
inline explicit constexpr SpeculativeDecodingMode(UnderlyingType state)
Public Static Functions
-
static inline constexpr auto None()
-
static inline constexpr auto DraftTokensExternal()
-
static inline constexpr auto Medusa()
-
static inline constexpr auto LookaheadDecoding()
-
static inline constexpr auto ExplicitDraftTokens()
-
static inline constexpr auto Eagle()
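A minimal sketch of how a mode created by these factory functions is queried through the predicates above; the buffer-sizing branch is only illustrative.

    auto const mode = SpeculativeDecodingMode::Medusa();
    assert(mode.isMedusa());
    assert(!mode.isNone());
    assert(mode == SpeculativeDecodingMode::Medusa());

    if (mode.predictsDraftTokens())
    {
        // e.g. size draft-token buffers accordingly
    }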
Private Functions
-
inline constexpr bool anyBitSet(UnderlyingType bits) const
-
inline constexpr bool allBitSet(UnderlyingType bits) const
Private Members
-
UnderlyingType mState = {kNone}
Private Static Attributes
-
static constexpr UnderlyingType kNone = {1U << 0U}
-
static constexpr UnderlyingType kDraftTokensExternal = {1U << 1U}
-
static constexpr UnderlyingType kMedusa = {1U << 2U}
-
static constexpr UnderlyingType kLookaheadDecoding = {1U << 3U}
-
static constexpr UnderlyingType kExplicitDraftTokens = {1U << 4U}
-
static constexpr UnderlyingType kEagle = {1U << 5U}
-
using UnderlyingType = std::uint8_t
-
class SpeculativeDecodingMode
-
namespace runtime
eagleBuffers.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace runtime
-
class EagleBuffers
Public Types
-
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>
-
using RequestVector = std::vector<LlmRequestPtr>
-
using SizeType32 = runtime::SizeType32
-
using TensorMap = runtime::StringPtrMap<runtime::ITensor>
Public Functions
-
EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
-
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)
-
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, runtime::ITensor const &requestTypes, ITensor const &seqSlots, EagleBuffers::Inputs const &decoderBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
-
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const
Public Members
-
class tensorrt_llm::runtime::EagleBuffers::EngineOutputs engineOutputs
Private Functions
-
template<typename T>
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, SizeType32 vocabSizePadded, ITensor const &seqSlots, EagleBuffers::Inputs const &draftBuffers, runtime::EagleModule const &eagleModule, runtime::BufferManager const &manager) const
Private Members
-
std::size_t scanTempStorageBytes = {0}
-
std::size_t reduceTempStorageBytes = {0}
-
float mDefaultPosteriorThreshold = {0.09f}
-
bool mDoGreedySampling = {true}
-
class EngineOutputs
Public Members
-
class Inputs
Public Functions
-
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)
Public Members
-
TensorPtr randomDataValidation
[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
-
TensorPtr draftTokens
[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-
TensorPtr draftPaths
[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-
TensorPtr specDecodingPackedMasks
[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
-
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)
-
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>
-
class EagleBuffers
-
namespace batch_manager
decodingInput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class DecodingInput
- #include <decodingInput.h>
Represents the inputs to the decoder.
This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.
Public Functions
-
inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize, TensorConstPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)
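A minimal construction sketch, assuming logits, endIds and batchSlots have been allocated elsewhere (e.g. through a BufferManager) with the shapes documented below; only the documented constructor is exercised.

    DecodingInput decodingInput(
        /*maxLength=*/maxInputLength,
        /*maxAttentionWindow=*/maxAttentionWindow,
        /*sinkTokenLength=*/0,
        /*batchSize=*/batchSize,
        /*logits=*/logits,         // [batchSize, beamWidth, vocabSizePadded], on gpu
        /*endIds=*/endIds,         // [batchSize * beamWidth], on gpu
        /*batchSlots=*/batchSlots  // [batchSize], pinned
    );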
Public Members
-
SizeType32 step
The index of the decoding step we are on. Only used in Python runtime.
-
SizeType32 maxLength
The maximum number of tokens to decode.
-
SizeType32 maxAttentionWindow
The maximum length of the attention window to consider while decoding.
-
SizeType32 sinkTokenLength
The number of tokens to use as attention sinks, as described here: https://arxiv.org/html/2309.17453v3.
-
SizeType32 batchSize
The number of samples in the batch.
-
SizeType32 maxStopWordsLen
The maximum value in the
stopWordsLens
tensor.
-
SizeType32 maxBadWordsLen
The maximum value in the
badWordsLens
tensor.
-
TensorConstPtr logits
[batchSize, beamWidth, vocabSizePadded], on gpu. Logits are a probability distribution over the vocabulary, the output of the model.
-
TensorConstPtr endIds
[batchSize * beamWidth], on gpu
-
TensorConstPtr batchSlots
[batchSize], address map of the linear batch id to the seq slots, int32_t, pinned
-
TensorConstPtr finishReasons
[batchSize, beamWidth], finished states at the current iteration. If true for a request, its decoding step is skipped; on gpu
-
TensorConstPtr sequenceLimitLength
[batchSize], on gpu. The maximum sequence length for each sequence in the batch.
-
TensorConstPtr embeddingBias
[batchSize, vocabSizePadded], on gpu
-
TensorConstPtr lengths
[batchSize, beamWidth], on gpu
-
TensorConstPtr badWordsPtrs
[batchSize][2, badWordsLength], on gpu
-
TensorConstPtr badWordsLens
[batchSize], on gpu
-
TensorConstPtr stopWordsPtrs
[batchSize][2, stopWordsLength], pinned
-
TensorConstPtr stopWordsLens
[batchSize], pinned
-
TensorConstPtr noRepeatNgramSize
[batchSize], on gpu
-
TensorPtr cacheIndirection
[batchSize, beamWidth, maxSeqLen] - the k/v cache index for beam search, on gpu
-
std::optional<MedusaInputs> medusaInputs
-
std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs
-
std::optional<LookaheadInputs> lookaheadInputs
-
std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs
-
std::optional<EagleInputs> eagleInputs
-
struct EagleInputs
Public Functions
-
inline EagleInputs(TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths, TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths, TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds, TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)
Public Members
-
TensorConstPtr nextDraftTokens
[batchSize, maxDecodingDraftTokens]
-
TensorConstPtr nextDraftLens
[batchSize]
-
TensorConstPtr nextDraftPaths
[batchSize, maxDecodingTokens, maxPathLen]
-
TensorConstPtr lastDraftTokens
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr lastDraftLens
[batchSize]
-
TensorConstPtr lastDraftPaths
[batchSize, maxDecodingTokens, maxPathLen]
-
TensorConstPtr acceptedTokens
[batchSize, maxPathLen]
-
TensorConstPtr acceptedLens
[batchSize]
-
TensorConstPtr acceptedPathIds
[batchSize]
-
TensorConstPtr chunkedContextNextTokens
[batchSize]
-
TensorConstPtr seqSlots
[batchSize]
-
inline EagleInputs(TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths, TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths, TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds, TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)
-
class ExplicitDraftTokensInputs
Public Members
-
TensorConstPtr nextDraftTokens
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr nextFlatTokens
[batchSize * maxDecodingTokens]
-
TensorConstPtr nextDraftIndices
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr nextDraftProbs
[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]
-
TensorConstPtr lastDraftTokens
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr lastDraftIndices
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr masks
[batchSize, maxDecodingTokens, maxDecodingTokens], bool
-
TensorConstPtr packedPositionIds
[batchSize * maxDecodingTokens]
-
TensorConstPtr bestPathLengths
[batchSize]
-
TensorConstPtr bestPathIndices
[batchSize]
-
TensorConstPtr nextGenerationLengths
[batchSize]
-
TensorConstPtr lastPositionIdsBase
[batchSize]
-
TensorConstPtr lastGenerationLengths
[batchSize]
-
TensorConstPtr maxGenLengthDevice
[1]
-
TensorConstPtr seqSlots
[batchSize]
-
TensorConstPtr nextDraftTokens
-
class ExternalDraftTokensInputs
-
struct LookaheadInputs
-
class MedusaInputs
Public Members
-
TensorConstPtr medusaPaths
[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu
-
TensorConstPtr medusaTreeIds
[batchSize, maxTokensPerStep], on gpu
-
std::vector<std::vector<TensorPtr>> medusaLogits
[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu
-
TensorConstPtr medusaTargetTokensPerStep
[batchSize], on gpu
-
TensorConstPtr medusaPaths
-
inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize, TensorConstPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)
-
class DecodingInput
-
namespace runtime
memoryCounters.h
-
namespace tensorrt_llm
-
namespace runtime
-
class MemoryCounters
-
Public Functions
-
MemoryCounters() = default
-
inline SizeType32 getGpu() const
-
inline SizeType32 getCpu() const
-
inline SizeType32 getPinned() const
-
inline SizeType32 getUVM() const
-
inline SizeType32 getPinnedPool() const
-
template<MemoryType T>
inline void allocate(SizeType32 size)
-
void allocate(MemoryType memoryType, SizeType32 size)
-
template<MemoryType T>
inline void deallocate(SizeType32 size)
-
void deallocate(MemoryType memoryType, SizeType32 size)
-
std::string toString() const
Public Static Functions
-
static MemoryCounters &getInstance()
-
static std::string bytesToString(SizeType32 bytes, int precision = 2)
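A minimal usage sketch of the singleton. The counters are normally updated by the runtime's allocators rather than by user code, and MemoryType::kGPU is assumed from the runtime's MemoryType enum, which is declared outside this header.

    auto& counters = MemoryCounters::getInstance();
    counters.allocate<MemoryType::kGPU>(1 << 20);   // record a 1 MiB device allocation
    std::cout << counters.toString() << '\n';
    std::cout << MemoryCounters::bytesToString(counters.getGpu()) << '\n';
    counters.deallocate<MemoryType::kGPU>(1 << 20);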
Private Members
-
std::atomic<SizeType32> mGpu = {}
-
std::atomic<SizeType32> mCpu = {}
-
std::atomic<SizeType32> mPinned = {}
-
std::atomic<SizeType32> mUVM = {}
-
std::atomic<SizeType32> mPinnedPool = {}
-
MemoryCounters() = default
-
class MemoryCounters
-
namespace runtime
gptDecoderBatched.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched
- #include <gptDecoderBatched.h>
GPT decoder class with support for in-flight batching.
Public Functions
-
GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const &speculativeDecodingMode, nvinfer1::DataType dtype)
-
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) override
Set up the decoder before calling forward().
-
virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override
Setup buffers for ExplicitDraftTokens decoding.
-
virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) override
Setup buffers for Eagle decoding.
-
virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override
Setup buffers for Lookahead decoding.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) override
Initialize the decoder with a new batch of inputs.
-
virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs, ModelConfig const &modelConfig) override
Initialize the batched decoder at seqSlots with new requests.
-
virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent) override
Wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent, decoder_batch::Output &output, decoder_batch::Input const &input) override
Call decoder forwardSync and wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() override
Wait for the last call to forwardAsync to complete.
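A minimal sketch of one in-flight batching step, assuming a set-up GptDecoderBatched named decoder and decoder_batch::Output/Input objects (batchOutput, batchInput) filled by the batch manager; only the documented methods are exercised.

    auto finishedEvent = decoder.forwardAsync(batchOutput, batchInput);

    // ... other host-side work can overlap with the decoder step here ...

    decoder.forwardSync(*finishedEvent);           // block until this step has completed
    auto allNewTokens = decoder.getAllNewTokens(); // [maxTokensPerStep, batchSize, maxBeamWidth], on gpu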
-
inline virtual std::vector<bool> getFinished() const override
- Returns:
[batchSize], indicators of finished requests
-
inline virtual TensorPtr getFinishReasons() const override
- Returns:
[batchSize, beamWidth], FinishedState value, on gpu
-
inline virtual TensorPtr getIds(SizeType32 batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.
-
inline virtual TensorPtr getIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
-
inline virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.
-
inline virtual TensorPtr getGatheredIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu
-
virtual CudaEvent finalize(SizeType32 batchSlot, SamplingConfig const &samplingConfig, bool streaming) const override
Gather final beam search results for request batchSlot. The results are only available after the returned event has completed.
-
virtual void finalize(SamplingConfig const &samplingConfig) const override
Gather final beam search results for all requests.
-
inline virtual TensorPtr getParentIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-
inline virtual TensorPtr getCumLogProbs() const override
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs() const override
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getAllNewTokens() const override
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
inline virtual TensorPtr getNewTokens(SizeType32 iter = 0) const override
Get tokens generated in one step of the last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
-
inline virtual std::vector<SizeType32> getNbSteps() const override
- Returns:
[batchSize], the number of generation steps executed on each request
-
inline virtual TensorPtr getNbFinished() const override
- Returns:
[1], number of finished sequences, in pinned host memory
-
inline virtual TensorPtr getNextDraftTokens() const override
- Returns:
[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
-
inline virtual TensorPtr getPrevDraftTokensLengths() const override
- Returns:
[batchSize], predicted draft tokens lengths for previous step, on gpu
-
inline virtual TensorPtr getNextDraftTokensLengths() const override
- Returns:
[batchSize], predicted draft tokens lengths for next step, on gpu
-
inline virtual TensorPtr getAcceptedLengthsCumSum() const override
- Returns:
[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
-
inline virtual TensorPtr getAcceptedPackedPaths() const override
- Returns:
[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
-
inline virtual executor::DecodingMode getDecodingMode() const override
Private Types
-
using GptDecoderPtr = std::unique_ptr<IGptDecoder>
-
using DecodingInputPtr = std::unique_ptr<DecodingInput>
-
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>
Private Functions
-
CudaEvent postProcessRequest(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const
Gather final beam search results for request batchIdx.
-
void newRequest(SizeType32 batchSlot, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)
Initialize the decoder at batchSlot with a new request.
-
void allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype)
Allocate buffers for speculative decoding.
-
void setupSpeculativeDecoding(ModelConfig const &modelConfig)
Setup buffers for speculative decoding.
-
void setupLookahead(ModelConfig const &modelConfig)
Setup buffers for lookahead decoding.
-
void newRequestSpeculativeDecoding(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)
Sets up decoder internal tensors for a new speculative decoding request.
-
void newRequestDraftTokensExternal(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)
Sets up decoder internal tensors for a new request in draft-model SpS mode.
-
void newRequestMedusa(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up decoder internal tensors for a new Medusa request.
-
void newRequestLookahead(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up decoder internal tensors for a new Lookahead request.
-
void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up decoder internal tensors for a new Explicit draft tokens request.
-
void newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const &request, ModelConfig const &modelConfig)
Sets up decoder internal tensors for a new Eagle request.
-
void updateFinished(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent)
Updates finished state on host for all active requests.
-
void setExplicitDraftTokensInputs(decoder_batch::Input const &input)
Sets inputs for explicit draft tokens.
-
void setEagleInputs(decoder_batch::Input const &input)
Sets inputs for eagle decoding.
-
void forwardDispatch(decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)
Calls decoders for tokens per engine step.
-
void forwardDecoder(SizeType32 step, decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)
Calls decoder for whole batch.
Private Members
-
std::size_t const mVocabSize
-
std::size_t const mVocabSizePadded
-
CudaStreamPtr mRuntimeStream
-
CudaStreamPtr mDecoderStream
-
BufferManager mBufferManager
-
DecoderFinishedEventPtr mDecoderFinishEvent
-
GptDecoderPtr mDecoder
-
DecodingInputPtr mJointDecodingInput
-
DecodingOutputPtr mJointDecodingOutput
-
std::vector<SizeType32> mNbSteps
-
std::vector<bool> mFinished
-
std::vector<SizeType32> mMaxNewTokens
-
std::vector<SizeType32> mBeamWidths
-
std::vector<SizeType32> mNumDecodingEngineTokens
-
SizeType32 mMaxSequenceLength = {}
-
SizeType32 mMaxAttentionWindow = {}
-
SizeType32 mSinkTokenLength = {}
-
SizeType32 mActualBatchSize = {}
-
SizeType32 mMaxDecodingDecoderTokens = {}
-
SizeType32 mMaxDecodingEngineTokens = {}
-
SpeculativeDecodingMode mSpeculativeDecodingMode
-
executor::DecodingMode mDecodingMode = {executor::DecodingMode::Auto()}
-
std::shared_ptr<DecodingOutput::BeamHypotheses> mOutputBeamHypotheses = {nullptr}
-
DecodingOutput::TensorPtr mCumLogProbsTmp
-
SizeType32 mNumSMs
-
GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const &speculativeDecodingMode, nvinfer1::DataType dtype)
-
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched
-
namespace runtime
rawEngine.h
-
namespace tensorrt_llm
-
namespace runtime
-
class RawEngine
Public Types
Public Functions
-
inline explicit RawEngine(std::filesystem::path enginePath) noexcept
-
inline explicit RawEngine(void const *engineAddr, std::size_t engineSize) noexcept
-
inline std::filesystem::path getPath() const
-
inline std::optional<std::filesystem::path> getPathOpt() const
-
inline void setPath(std::filesystem::path enginePath)
-
inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt() const
-
inline void setManagedWeightsMap(std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap)
-
inline void const *getAddress() const
-
inline std::size_t getSize() const
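A minimal sketch of the two construction paths documented above. The engine path is a placeholder and loadEngineBlob is a hypothetical helper, not part of the API.

    RawEngine fromFile{std::filesystem::path{"/path/to/model.engine"}};
    auto path = fromFile.getPath();

    std::vector<uint8_t> blob = loadEngineBlob(); // hypothetical helper
    RawEngine fromMemory{blob.data(), blob.size()};
    auto size = fromMemory.getSize();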
-
inline explicit RawEngine(std::filesystem::path enginePath) noexcept
-
class RawEngine
-
namespace runtime
gptSession.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace kv_cache_manager
-
namespace runtime
-
class GptSession
-
Public Functions
-
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, RawEngine const &rawEngine, LoggerPtr logger = nullptr)
- Parameters:
sessionConfig – Configuration of the session.
modelConfig – Description of the model.
worldConfig – Description of the environment.
rawEngine – The compiled TensorRT engine.
logger – The optional logger.
-
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
-
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
-
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
-
BufferManager const &getBufferManager() const
-
BufferManager::CudaStreamPtr getRuntimeStreamPtr() const
-
inline ModelConfig const &getModelConfig() const
-
inline WorldConfig const &getWorldConfig() const
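A minimal construction sketch, assuming sessionConfig, modelConfig and worldConfig have been created elsewhere (e.g. from the engine's serialized configuration); the engine path is a placeholder and only the documented constructor and getters are shown.

    RawEngine rawEngine{std::filesystem::path{"/path/to/model.engine"}};
    GptSession session{sessionConfig, modelConfig, worldConfig, rawEngine, /*logger=*/nullptr};

    auto const& bufferManager = session.getBufferManager();
    auto const& model = session.getModelConfig();
    auto streamPtr = session.getRuntimeStreamPtr();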
-
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, RawEngine const &rawEngine, LoggerPtr logger = nullptr)
-
class GptSession
-
namespace batch_manager