Runtime

iStatefulGptDecoder.h

namespace tensorrt_llm
namespace batch_manager
namespace runtime
class IStatefulGptDecoder
#include <iStatefulGptDecoder.h>

GPT decoder class with support for in-flight batching.

Subclassed by tensorrt_llm::runtime::IGptDecoderBatched

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) = 0

Set up the decoder before calling forward(); also calls reshapeBuffers.

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) = 0

Initialize the decoder with a new batch of inputs.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0

Run one step for all requests without blocking the host thread.

virtual void forwardSync() = 0

Wait for the last call to forwardAsync to complete.

inline virtual void forward(decoder::Output &output, decoder::Input const &input)

Run one step for all requests.

virtual void finalize(SamplingConfig const &samplingConfig) const = 0

Gather final beam search results for all requests.
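
A minimal usage sketch of the call sequence above, assuming decoder is a concrete IStatefulGptDecoder implementation and that the configuration objects and decoder::Input/decoder::Output buffers (illustrative names) have already been prepared:

decoder.setup(mode, maxBatchSize, maxBeamWidth, maxAttentionWindow,
    sinkTokenLength, maxSequenceLength, maxTokensPerStep, dtype, modelConfig);
decoder.newBatch(generationInput, generationOutput, samplingConfig, modelConfig);
while (!allSequencesFinished) // illustrative condition, e.g. tracked via getNbFinished()
{
    decoder.forwardAsync(decoderOutput, decoderInput); // enqueue one decoding step
    decoder.forwardSync();                             // wait for the step to complete
}
decoder.finalize(samplingConfig); // gather final beam search results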

virtual TensorPtr getIds() const = 0
Returns:

[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu

virtual TensorPtr getGatheredIds() const = 0
Returns:

[batchSize, beamWidth, maxSequenceLength] token ids after gatherTree

virtual TensorPtr getCumLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

virtual TensorPtr getLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

virtual TensorPtr getNewTokens(SizeType32 iter = 0) const = 0

Get tokens generated in one step of last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

virtual TensorPtr getAllNewTokens() const = 0

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

virtual TensorPtr getNbFinished() const = 0
Returns:

[1], number of finished sequences, in pinned host memory

virtual ~IStatefulGptDecoder() = default

Protected Functions

IStatefulGptDecoder() = default
namespace decoder
class Input

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit Input(TensorPtr logits)

Public Members

TensorPtr logits
TensorPtr cacheIndirection
class Output

Public Types

using TensorPtr = std::shared_ptr<ITensor>

Public Functions

Output() = default

Public Members

TensorPtr cacheIndirection
TensorPtr sequenceLengths

lookaheadModule.h

namespace tensorrt_llm
namespace runtime
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule

Public Functions

inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
inline explicit LookaheadModule() noexcept
inline void setExecutionConfig(executor::LookaheadDecodingConfig const &config)
inline executor::LookaheadDecodingConfig const getExecutionConfig() const

Private Members

executor::LookaheadDecodingConfig mExecutionConfig

iTensor.h

namespace nvinfer1
namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)

Utility function to print a shape.

std::ostream &operator<<(std::ostream &output, ITensor const &tensor)

Utility function to print a tensor with its shape.

template<typename T>
T const *bufferCastOrNull(ITensor::SharedConstPtr const &tensorPtr)

Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

Template Parameters:

T – The type of the underlying data.

Parameters:

tensorPtr – A possibly null shared ptr.

Returns:

A pointer to T const, possibly nullptr.

template<typename T>
T *bufferCastOrNull(ITensor::SharedPtr const &tensorPtr)

Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

Template Parameters:

T – The type of the underlying data.

Parameters:

tensorPtr – A possibly null shared ptr.

Returns:

A pointer to T, possibly nullptr.

template<typename T>
T *bufferCastOrNull(std::optional<ITensor::SharedPtr> const &optionalTensorPtr)

Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in optionalTensorPtr, or nullptr if the optional doesn’t have a value.

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

Template Parameters:

T – The type of the underlying data.

Parameters:

optionalTensorPtr – A possibly empty optional.

Returns:

A pointer to T, possibly nullptr.

template<typename T>
T const *bufferCastOrNull(std::optional<ITensor::SharedConstPtr> const &optionalTensorPtr)

Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in optionalTensorPtr, or nullptr if the optional doesn’t have a value.

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

Template Parameters:

T – The type of the underlying data.

Parameters:

optionalTensorPtr – A possibly empty optional.

Returns:

A pointer to T const, possibly nullptr.
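
A brief null-safe usage sketch of these overloads (maybeGetTensor is a hypothetical helper that may return an empty pointer):

ITensor::SharedPtr tensorPtr = maybeGetTensor();
if (float* data = bufferCastOrNull<float>(tensorPtr))
{
    data[0] = 1.0f; // safe: tensorPtr was non-null
}
std::optional<ITensor::SharedConstPtr> optionalPtr = std::nullopt;
float const* constData = bufferCastOrNull<float>(optionalPtr); // nullptr: the optional is empty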

class ITensor : public virtual tensorrt_llm::runtime::IBuffer

Public Types

using UniquePtr = std::unique_ptr<ITensor>
using SharedPtr = std::shared_ptr<ITensor>
using UniqueConstPtr = std::unique_ptr<ITensor const>
using SharedConstPtr = std::shared_ptr<ITensor const>
using Shape = nvinfer1::Dims
using DimType64 = std::remove_reference_t<decltype(Shape::d[0])>
using TensorMap = runtime::StringPtrMap<runtime::ITensor>

Public Functions

~ITensor() override = default
virtual Shape const &getShape() const = 0

Returns the tensor dimensions.

template<SizeType32 n>
inline DimType64 getDimension() const

Returns the tensor’s n-th dimension. If n is negative, returns the (nbDims + n)-th dimension. TODO: replace with constexpr parameter when moving to C++20.

virtual void reshape(Shape const &dims) = 0

Sets the tensor dimensions. The new size of the tensor will be volume(dims)

inline virtual void resize(std::size_t newSize) override

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

ITensor(ITensor const&) = delete

Not allowed to copy.

ITensor &operator=(ITensor const&) = delete

Not allowed to copy.

inline void squeeze(SizeType32 dim)

Removes the given unit dimension from this tensor.

inline void unsqueeze(SizeType32 dim)

Adds a unit dimension at the specified position.

inline bool shapeEquals(Shape const &other) const
inline bool shapeEquals(std::initializer_list<SizeType32> const &other) const
template<typename T>
inline bool shapeEquals(T const *dims, SizeType32 count) const

Public Static Functions

static inline std::int64_t volume(Shape const &dims)

Returns the volume of the dimensions. Returns -1 if dims.nbDims < 0.

static inline std::size_t volumeNonNegative(Shape const &shape)

Returns the volume of the dimensions. Throws if shape.nbDims < 0.

static inline Shape strides(Shape const &dims)

Returns the strides of each dimension in a Shape.
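
For example, assuming these helpers behave as documented here (illustrative values):

auto const shape = ITensor::makeShape({2, 3, 4});
auto const numElements = ITensor::volume(shape); // 24
auto const strides = ITensor::strides(shape);    // [12, 4, 1] for a row-major layout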

static Shape squeeze(Shape const &shape, SizeType32 dim)

Removes the given unit dimension from shape.

Parameters:
  • shape – The shape to squeeze.

  • dim – The dimension that should be removed (“squeezed”).

Returns:

A new shape without the unit dimension.

static Shape unsqueeze(Shape const &shape, SizeType32 dim)

Add a unit dimension to shape at the specified position.

Parameters:
  • shape – The shape to unsqueeze.

  • dim – The dimension where unit dimension should be added.

Returns:

A new shape with the added unit dimension.
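
For example, under the documented semantics (illustrative shapes):

auto shape = ITensor::makeShape({1, 8, 16});
auto squeezed = ITensor::squeeze(shape, 0);        // [8, 16]
auto unsqueezed = ITensor::unsqueeze(squeezed, 2); // [8, 16, 1]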

static UniquePtr slice(SharedPtr tensor, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.

Parameters:
  • tensor – The tensor to view.

  • offset – The offset of the view w.r.t. dimension 0 of the tensor.

  • size – The size of the view w.r.t. dimension 0 of the tensor.

Returns:

A view on the tensor.
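
A short sketch, assuming tensor is an ITensor::SharedPtr of shape [8, 16]:

auto rows = ITensor::slice(tensor, 2, 3); // view of shape [3, 16] over rows 2..4, sharing the underlying storage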

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
static inline UniquePtr slice(SharedPtr tensor, std::size_t offset)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
static UniquePtr slice(SharedPtr tensor, Shape const &offsetDims, DimType64 size)
Parameters:
  • tensor – The tensor to view.

  • offsetDims – The offset of the view in multiple dimensions.

  • size – The size of the view w.r.t. the last dimension in offsetDims.

Throws:

Whenever – offset overflows or the last dimension offset+size overflows.

Returns:

A view of shape [size, the rest dimensions], or [size] when offsetDims specifies all dimensions.

static inline UniquePtr slice(SharedPtr tensor, std::initializer_list<DimType64> const &offsetDims, DimType64 size)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims, std::size_t size)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims, std::size_t size)
static inline UniquePtr slice(SharedPtr tensor, Shape const &offsetDims)

Returns the remaining slices in the last dimension when size is omitted.

static inline UniquePtr slice(SharedPtr tensor, std::initializer_list<DimType64> const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
static inline UniquePtr at(SharedPtr tensor, Shape const &offsetDims)
Parameters:

offsetDims – The offset in multiple dimensions.

Returns:

Just the block at the point, with shape [the rest dimensions], or [1] when offsetDims specifies all dimensions.
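
A sketch of the multi-dimensional forms, assuming tensor has shape [4, 8, 16]:

auto part = ITensor::slice(tensor, {1, 2}, 3); // view of shape [3, 16]: rows 2..4 within block 1
auto block = ITensor::at(tensor, {1, 2});      // view of shape [16]: the block at point (1, 2)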

static inline UniquePtr at(SharedPtr tensor, std::initializer_list<DimType64> const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr at(TConstPtr &&tensor, Shape const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline ITensor::UniqueConstPtr at(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
static UniquePtr view(IBuffer::SharedPtr buffer, Shape const &dims)

Returns a view on the underlying buffer (or tensor) with the given shape.

Parameters:
  • buffer – The buffer to view.

  • dims – The shape of the view.

Returns:

A view on the buffer.
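
A brief sketch, assuming buffer is an IBuffer::SharedPtr holding 24 elements:

auto matrix = ITensor::view(buffer, ITensor::makeShape({4, 6})); // same storage, interpreted as [4, 6]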

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
static inline UniquePtr view(SharedPtr tensor)

Returns a view on the underlying tensor which can be independently reshaped.

Parameters:

tensor – The tensor to view.

Returns:

A view on the tensor.

static inline UniquePtr flattenN(SharedPtr tensor, std::int64_t sliceN = -1)

Returns a flattened view on the underlying tensor which can be independently reshaped.

Parameters:
  • tensor – The tensor to flatten.

  • sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.

Returns:

A flattened view on the tensor.

static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)

Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.

Parameters:
  • data – The data to wrap.

  • type – The data type of the data.

  • shape – The shape of the tensor.

  • capacity – The capacity of the buffer.

Returns:

An ITensor.
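
A hedged example of wrapping host data; the vector must outlive the tensor, since the ITensor does not own the memory:

std::vector<float> host(12, 0.0f);
auto tensor = ITensor::wrap(host, ITensor::makeShape({3, 4})); // non-owning view over host.data()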

static inline UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(std::vector<T> &v, Shape const &shape)
static Shape makeShape(std::initializer_list<DimType64> const &dims)

A convenience function to create a tensor shape with the given dimensions.

static std::string toString(Shape const &dims)

A convenience function for converting a tensor shape to a string.

static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)

A convenience function to compare shapes.

template<typename T>
static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType32 count)

A convenience function to compare shapes.

Protected Functions

ITensor() = default

Protected Static Functions

static inline DimType64 castSize(size_t newSize)

Friends

friend class ITensorBindings

speculativeDecodingMode.h

namespace tensorrt_llm
namespace runtime
class SpeculativeDecodingMode

Public Types

using UnderlyingType = std::uint8_t

Public Functions

inline constexpr bool isNone() const
inline constexpr bool isDraftTokensExternal() const
inline constexpr bool isMedusa() const
inline constexpr bool isLookaheadDecoding() const
inline constexpr bool isExplicitDraftTokens() const
inline constexpr bool isEagle() const
inline constexpr bool updatesPositionIds() const
inline constexpr bool requiresAttentionMask() const
inline constexpr bool predictsDraftTokens() const
inline constexpr bool needsKVCacheRewind() const
inline constexpr bool variableDraftLength() const
inline constexpr bool hasDraftLogits() const
inline constexpr bool needsDecoderPrologue() const
inline bool operator==(SpeculativeDecodingMode const &other) const
inline explicit constexpr SpeculativeDecodingMode(UnderlyingType state)

Public Static Functions

static inline constexpr auto None()
static inline constexpr auto DraftTokensExternal()
static inline constexpr auto Medusa()
static inline constexpr auto LookaheadDecoding()
static inline constexpr auto ExplicitDraftTokens()
static inline constexpr auto Eagle()

Private Functions

inline constexpr bool anyBitSet(UnderlyingType bits) const
inline constexpr bool allBitSet(UnderlyingType bits) const

Private Members

UnderlyingType mState = {kNone}

Private Static Attributes

static constexpr UnderlyingType kNone = {1U << 0U}
static constexpr UnderlyingType kDraftTokensExternal = {1U << 1U}
static constexpr UnderlyingType kMedusa = {1U << 2U}
static constexpr UnderlyingType kLookaheadDecoding = {1U << 3U}
static constexpr UnderlyingType kExplicitDraftTokens = {1U << 4U}
static constexpr UnderlyingType kEagle = {1U << 5U}
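
The mode is a bitmask over the constants above; a brief usage sketch:

auto const mode = SpeculativeDecodingMode::Medusa();
bool const isMedusa = mode.isMedusa();               // true
bool const isLookahead = mode.isLookaheadDecoding(); // false
bool const same = mode == SpeculativeDecodingMode::Medusa(); // true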

eagleBuffers.h

namespace tensorrt_llm
namespace batch_manager
namespace runtime
class EagleBuffers

Public Types

using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>
using RequestVector = std::vector<LlmRequestPtr>
using SizeType32 = runtime::SizeType32
using ITensor = runtime::ITensor
using BufferPtr = runtime::IBuffer::SharedPtr
using TensorPtr = runtime::ITensor::SharedPtr
using TensorMap = runtime::StringPtrMap<runtime::ITensor>

Public Functions

EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, runtime::ITensor const &requestTypes, ITensor const &seqSlots, EagleBuffers::Inputs const &decoderBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const

Public Members

Inputs engineInputs
EngineOutputs engineOutputs

Private Functions

template<typename T>
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, SizeType32 vocabSizePadded, ITensor const &seqSlots, EagleBuffers::Inputs const &draftBuffers, runtime::EagleModule const &eagleModule, runtime::BufferManager const &manager) const

Private Members

std::size_t scanTempStorageBytes = {0}
std::size_t reduceTempStorageBytes = {0}
float mDefaultPosteriorThreshold = {0.09f}
bool mDoGreedySampling = {true}
BufferPtr scanReduceTempStorage
TensorPtr cumSumGenerationLengths
TensorPtr maxGenerationLength
TensorPtr chunkedContextNextTokensHost
TensorPtr greedySamplingHost
TensorPtr posteriorAlphaHost
TensorPtr posteriorThresholdHost
class EngineOutputs

Public Members

TensorPtr nextDraftTokens

[batchSize, maxDecodingDraftTokens]

TensorPtr nextDraftLens

[batchSize]

TensorPtr nextDraftPaths

[batchSize, maxNumPaths, maxPathLen]

TensorPtr acceptedTokens

[batchSize, maxPathLen]

TensorPtr acceptedLens

[batchSize]

TensorPtr acceptedPaths

[batchSize]

TensorPtr chunkedContextNextTokens

[batchSize]

class Inputs

Public Functions

void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)

Public Members

TensorPtr temperatures

[maxBatchSize] or [numSequences]

TensorPtr posteriorAlpha

[maxBatchSize] or [numSequences]

TensorPtr posteriorThreshold

[maxBatchSize] or [numSequences]

TensorPtr randomDataSample

[maxBatchSize] or [numSequences]

TensorPtr randomDataValidation

[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]

TensorPtr draftTokens

[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]

TensorPtr draftLens

[maxBatchSize] or [numSequences]

TensorPtr draftPaths

[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]

TensorPtr specDecodingGenerationLengths

[maxBatchSize] or [numGenSequences]

TensorPtr specDecodingGenerationLengthsHost

[maxBatchSize] or [numGenSequences]

TensorPtr specDecodingPackedMasks

[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]

TensorPtr specDecodingPositionOffsets

[maxBatchSize] or [numGenSequences]

TensorPtr eagleNetCtxRequestTypesHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetCtxContextLengthsHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetCtxPastKeyValueLengthsHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetGenRequestTypesHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetGenContextLengthsHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetGenPastKeyValueLengthsHost

[maxBatchSize] or [numSequences]

TensorPtr inputGenTokensHost

[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]

TensorPtr chunkedContextNextTokens

[maxBatchSize] or [numSequences]

TensorPtr useDynamicTreeHost

[1]

decodingInput.h

namespace tensorrt_llm
namespace runtime
class DecodingInput
#include <decodingInput.h>

Represents the inputs to the decoder.

This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.

Public Types

using TensorConstPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr

Public Functions

inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize, TensorConstPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)
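
A construction sketch, assuming the tensors were allocated elsewhere with the shapes documented below (all names illustrative):

DecodingInput input{maxLength, maxAttentionWindow, sinkTokenLength, batchSize,
    logits,     // [batchSize, beamWidth, vocabSizePadded], on gpu
    endIds,     // [batchSize * beamWidth], on gpu
    batchSlots  // [batchSize], pinned
};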

Public Members

SizeType32 step

The index of the decoding step we are on. Only used in Python runtime.

SizeType32 maxLength

The maximum number of tokens to decode.

SizeType32 maxAttentionWindow

The maximum length of the attention window to consider while decoding.

SizeType32 sinkTokenLength

The number of tokens to use as attention sinks, as described in https://arxiv.org/html/2309.17453v3.

SizeType32 batchSize

The number of samples in the batch.

SizeType32 maxStopWordsLen

The maximum value in the stopWordsLens tensor.

SizeType32 maxBadWordsLen

The maximum value in the badWordsLens tensor.

TensorConstPtr logits

[batchSize, beamWidth, vocabSizePadded], on gpu. Logits are a probability distribution over the vocabulary, the output of the model.

TensorConstPtr endIds

[batchSize * beamWidth], on gpu

TensorConstPtr batchSlots

[batchSize], address map from the linear batch id to the seq slots, int32_t, pinned

TensorConstPtr finishReasons

[batchSize, beamWidth], finished states at the current iteration. If true for a request, its decoding step is skipped; on gpu

TensorConstPtr sequenceLimitLength

[batchSize], on gpu. The maximum sequence length for each sequence in the batch.

TensorConstPtr embeddingBias

[batchSize, vocabSizePadded], on gpu

TensorConstPtr lengths

[batchSize, beamWidth], on gpu

std::vector<TensorPtr> badWordsLists
TensorConstPtr badWordsPtrs

[batchSize][2, badWordsLength], on gpu

TensorConstPtr badWordsLens

[batchSize], on gpu

std::vector<TensorPtr> stopWordsLists
TensorConstPtr stopWordsPtrs

[batchSize][2, stopWordsLength], pinned

TensorConstPtr stopWordsLens

[batchSize], pinned

TensorConstPtr noRepeatNgramSize

[batchSize], on gpu

TensorPtr cacheIndirection

[batchSize, beamWidth, maxSeqLen] - the k/v cache index for beam search, on gpu

std::optional<MedusaInputs> medusaInputs
std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs
std::optional<LookaheadInputs> lookaheadInputs
std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs
std::optional<EagleInputs> eagleInputs
struct EagleInputs

Public Functions

inline EagleInputs(TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths, TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths, TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds, TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)

Public Members

TensorConstPtr nextDraftTokens

[batchSize, maxDecodingDraftTokens]

TensorConstPtr nextDraftLens

[batchSize]

TensorConstPtr nextDraftPaths

[batchSize, maxDecodingTokens, maxPathLen]

TensorConstPtr lastDraftTokens

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr lastDraftLens

[batchSize]

TensorConstPtr lastDraftPaths

[batchSize, maxDecodingTokens, maxPathLen]

TensorConstPtr acceptedTokens

[batchSize, maxPathLen]

TensorConstPtr acceptedLens

[batchSize]

TensorConstPtr acceptedPathIds

[batchSize]

TensorConstPtr chunkedContextNextTokens

[batchSize]

TensorConstPtr seqSlots

[batchSize]

class ExplicitDraftTokensInputs

Public Members

TensorConstPtr nextDraftTokens

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr nextFlatTokens

[batchSize * maxDecodingTokens]

TensorConstPtr nextDraftIndices

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr nextDraftProbs

[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]

TensorConstPtr lastDraftTokens

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr lastDraftIndices

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr masks

[batchSize, maxDecodingTokens, maxDecodingTokens], bool

TensorConstPtr packedPositionIds

[batchSize * maxDecodingTokens]

TensorConstPtr bestPathLengths

[batchSize]

TensorConstPtr bestPathIndices

[batchSize]

TensorConstPtr nextGenerationLengths

[batchSize]

TensorConstPtr lastPositionIdsBase

[batchSize]

TensorConstPtr lastGenerationLengths

[batchSize]

TensorConstPtr maxGenLengthDevice

[1]

TensorConstPtr seqSlots

[batchSize]

class ExternalDraftTokensInputs

Public Members

TensorPtr draftLogits
TensorPtr draftProbs
TensorPtr targetProbs
TensorPtr numDraftTokens
TensorPtr draftTokenIds
TensorPtr useDraftLogits
TensorPtr useDraftLogitsHost
SizeType32 step
float constantThreshold
bool useRandomAcceptanceThreshold
struct LookaheadInputs

Public Members

TensorPtr tokensPerStep
class MedusaInputs

Public Members

TensorConstPtr medusaPaths

[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu

TensorConstPtr medusaTreeIds

[batchSize, maxTokensPerStep], on gpu

std::vector<std::vector<TensorPtr>> medusaLogits

[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu

TensorPtr medusaCurTokensPerStep

[batchSize], on gpu

TensorConstPtr medusaTargetTokensPerStep

[batchSize], on gpu

memoryCounters.h

namespace tensorrt_llm
namespace runtime
class MemoryCounters

Public Types

using SizeType32 = std::size_t
using DiffType = std::ptrdiff_t

Public Functions

MemoryCounters() = default
inline SizeType32 getGpu() const
inline SizeType32 getCpu() const
inline SizeType32 getPinned() const
inline SizeType32 getUVM() const
inline SizeType32 getPinnedPool() const
inline DiffType getGpuDiff() const
inline DiffType getCpuDiff() const
inline DiffType getPinnedDiff() const
inline DiffType getUVMDiff() const
inline DiffType getPinnedPoolDiff() const
template<MemoryType T>
inline void allocate(SizeType32 size)
void allocate(MemoryType memoryType, SizeType32 size)
template<MemoryType T>
inline void deallocate(SizeType32 size)
void deallocate(MemoryType memoryType, SizeType32 size)
std::string toString() const

Public Static Functions

static MemoryCounters &getInstance()
static std::string bytesToString(SizeType32 bytes, int precision = 2)
static std::string bytesToString(DiffType bytes, int precision = 2)
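
A brief usage sketch of the singleton (the exact string format is an assumption):

auto& counters = MemoryCounters::getInstance();
std::string gpuUsage = MemoryCounters::bytesToString(counters.getGpu()); // e.g. "1.50 GB"
std::string summary = counters.toString(); // summary across all memory types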

Private Members

std::atomic<SizeType32> mGpu = {}
std::atomic<SizeType32> mCpu = {}
std::atomic<SizeType32> mPinned = {}
std::atomic<SizeType32> mUVM = {}
std::atomic<SizeType32> mPinnedPool = {}
std::atomic<DiffType> mGpuDiff = {}
std::atomic<DiffType> mCpuDiff = {}
std::atomic<DiffType> mPinnedDiff = {}
std::atomic<DiffType> mUVMDiff = {}
std::atomic<DiffType> mPinnedPoolDiff = {}

gptDecoderBatched.h

namespace tensorrt_llm
namespace runtime
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched
#include <gptDecoderBatched.h>

GPT decoder class with support for in-flight batching.

Public Types

enum class ForwardType

Values:

enumerator kASYNC
enumerator kSYNC
using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = ITensor::SharedPtr
using SharedConstPtr = ITensor::SharedConstPtr

Public Functions

GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const &speculativeDecodingMode, nvinfer1::DataType dtype)
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) override

Set up the decoder before calling forward().

virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override

Set up buffers for ExplicitDraftTokens decoding.

virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) override

Set up buffers for Eagle decoding.

virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override

Set up buffers for Lookahead decoding.

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) override

Initialize the decoder with a new batch of inputs.

virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs, ModelConfig const &modelConfig) override

Initialize the batched decoder at seqSlots with new requests.

virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent) override

Wait for the call to forwardAsync associated with a token to complete.

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent, decoder_batch::Output &output, decoder_batch::Input const &input) override

Call decoder forwardSync and wait for the call to forwardAsync associated with a token to complete.
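
A sketch of the event-based flow these overloads enable, assuming decoder and the decoder_batch buffers are already set up (illustrative names):

auto finishedEvent = decoder.forwardAsync(batchOutput, batchInput); // enqueue, returns immediately
// ... overlap host-side work here, e.g. preparing the next engine step ...
decoder.forwardSync(*finishedEvent); // block until that decoding step has completed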

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override

Run one step for all requests without blocking the host thread.

virtual void forwardSync() override

Wait for the last call to forwardAsync to complete.

inline virtual std::vector<bool> getFinished() const override
Returns:

[batchSize], indicators of finished requests

inline virtual TensorPtr getFinishReasons() const override
Returns:

[batchSize, beamWidth], FinishedState value, on gpu

inline virtual TensorPtr getIds(SizeType32 batchIdx) const override
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.

inline virtual TensorPtr getIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.

inline virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const override
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.

inline virtual TensorPtr getGatheredIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu

virtual CudaEvent finalize(SizeType32 batchSlot, SamplingConfig const &samplingConfig, bool streaming) const override

Gather final beam search results for request batchSlot. Results will only be available after the returned event has completed.

virtual void finalize(SamplingConfig const &samplingConfig) const override

Gather final beam search results for all requests.

inline virtual TensorPtr getParentIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu

inline virtual TensorPtr getCumLogProbs() const override
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const override
Returns:

[maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs() const override
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs(SizeType32 batchIdx) const override
Returns:

[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getAllNewTokens() const override

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

inline virtual TensorPtr getNewTokens(SizeType32 iter = 0) const override

Get tokens generated in one step of last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

inline virtual std::vector<SizeType32> getNbSteps() const override
Returns:

[batchSize], the number of generation steps executed on each request

inline virtual TensorPtr getNbFinished() const override
Returns:

[1], number of finished sequences, in pinned host memory

inline virtual TensorPtr getNextDraftTokens() const override
Returns:

[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu

inline virtual TensorPtr getPrevDraftTokensLengths() const override
Returns:

[batchSize], predicted draft token lengths for the previous step, on gpu

inline virtual TensorPtr getNextDraftTokensLengths() const override
Returns:

[batchSize], predicted draft token lengths for the next step, on gpu

inline virtual TensorPtr getAcceptedLengthsCumSum() const override
Returns:

[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu

inline virtual TensorPtr getAcceptedPackedPaths() const override
Returns:

[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into a contiguous tensor, on gpu

inline virtual executor::DecodingMode getDecodingMode() const override

Private Types

using GptDecoderPtr = std::unique_ptr<IGptDecoder>
using DecodingInputPtr = std::unique_ptr<DecodingInput>
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>

Private Functions

CudaEvent postProcessRequest(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const

Gather final beam search results for request batchIdx.

void newRequest(SizeType32 batchSlot, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)

Initialize the decoder at batchSlot with a new request.

void allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype)

Allocate buffers for speculative decoding.

void setupSpeculativeDecoding(ModelConfig const &modelConfig)

Set up buffers for speculative decoding.

void setupLookahead(ModelConfig const &modelConfig)

Set up buffers for lookahead decoding.

void newRequestSpeculativeDecoding(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)

Sets up decoder internal tensors for a new speculative decoding request.

void newRequestDraftTokensExternal(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)

Sets up decoder internal tensors for a new request in Draft model SpS mode.

void newRequestMedusa(SizeType32 batchIdx, decoder_batch::Request const &request)

Sets up decoder internal tensors for a new Medusa request.

void newRequestLookahead(SizeType32 batchIdx, decoder_batch::Request const &request)

Sets up decoder internal tensors for a new Lookahead request.

void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const &request)

Sets up decoder internal tensors for a new Explicit draft tokens request.

void newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const &request, ModelConfig const &modelConfig)

Sets up decoder internal tensors for a new Eagle request.

void updateFinished(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent)

Updates finished state on host for all active requests.

void setExplicitDraftTokensInputs(decoder_batch::Input const &input)

Sets inputs for explicit draft tokens.

void setEagleInputs(decoder_batch::Input const &input)

Sets inputs for eagle decoding.

void forwardDispatch(decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)

Calls the decoder for each token of an engine step.

void forwardDecoder(SizeType32 step, decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)

Calls the decoder for the whole batch.

Private Members

std::size_t const mVocabSize
std::size_t const mVocabSizePadded
CudaStreamPtr mRuntimeStream
CudaStreamPtr mDecoderStream
BufferManager mBufferManager
DecoderFinishedEventPtr mDecoderFinishEvent
CudaEvent mForwardEvent
GptDecoderPtr mDecoder
DecodingInputPtr mJointDecodingInput
DecodingOutputPtr mJointDecodingOutput
std::vector<SizeType32> mNbSteps
std::vector<bool> mFinished
TensorPtr mFinishedSum
std::vector<SizeType32> mMaxNewTokens
std::vector<SizeType32> mBeamWidths
std::vector<SizeType32> mNumDecodingEngineTokens
TensorPtr mFinishedSteps
TensorPtr mBatchSlotsSetup
TensorPtr mBatchSlotsDecoder
SizeType32 mMaxSequenceLength = {}
SizeType32 mMaxAttentionWindow = {}
SizeType32 mSinkTokenLength = {}
SizeType32 mActualBatchSize = {}
SizeType32 mMaxDecodingDecoderTokens = {}
SizeType32 mMaxDecodingEngineTokens = {}
SpeculativeDecodingMode mSpeculativeDecodingMode
executor::DecodingMode mDecodingMode = {executor::DecodingMode::Auto()}
std::shared_ptr<DecodingOutput::BeamHypotheses> mOutputBeamHypotheses = {nullptr}
DecodingOutput::TensorPtr mCumLogProbsTmp
SizeType32 mNumSMs

rawEngine.h

namespace tensorrt_llm
namespace runtime
class RawEngine

Public Types

enum Type

Values:

enumerator FilePath
enumerator AddressWithSize
enumerator HostMemory

Public Functions

inline explicit RawEngine(std::filesystem::path enginePath) noexcept
inline explicit RawEngine(void const *engineAddr, std::size_t engineSize) noexcept
inline explicit RawEngine(nvinfer1::IHostMemory const *engineBuffer) noexcept
inline Type getType() const
inline std::filesystem::path getPath() const
inline std::optional<std::filesystem::path> getPathOpt() const
inline void setPath(std::filesystem::path enginePath)
inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt() const
inline void setManagedWeightsMap(std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap)
inline void const *getAddress() const
inline std::size_t getSize() const
inline nvinfer1::IHostMemory const *getHostMemory() const
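
The three constructors correspond to the three Type values; a brief sketch, assuming engineBlob is a std::vector<uint8_t> holding a serialized engine (illustrative names):

RawEngine fromFile{std::filesystem::path{"rank0.engine"}};  // Type::FilePath
RawEngine fromMemory{engineBlob.data(), engineBlob.size()}; // Type::AddressWithSize; the data must stay alive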

Public Members

void const *mEngineAddr = {}
std::size_t mEngineSize = {}

Private Members

Type mType
std::optional<std::filesystem::path> mEnginePath
nvinfer1::IHostMemory const *mEngineBuffer = {}
std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> mManagedWeightsMap

gptSession.h

namespace tensorrt_llm
namespace batch_manager
namespace kv_cache_manager
namespace runtime
class GptSession

Public Types

using LoggerPtr = std::shared_ptr<nvinfer1::ILogger>

Public Functions

GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, RawEngine const &rawEngine, LoggerPtr logger = nullptr)
Parameters:
  • sessionConfig – Configuration of the session,

  • modelConfig – Description of the model,

  • worldConfig – Description of the environment,

  • rawEngine – The compiled TensorRT engine,

  • logger – The optional logger.

inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
nvinfer1::ILogger &getLogger() const
BufferManager const &getBufferManager() const
BufferManager::CudaStreamPtr getRuntimeStreamPtr() const
inline ModelConfig const &getModelConfig() const
inline WorldConfig const &getWorldConfig() const
inline int getDevice() const noexcept