Runtime

bufferManager.h

namespace tensorrt_llm
namespace runtime
class BufferManager
#include <bufferManager.h>

A helper class for managing memory on host and device.

Public Types

using IBufferPtr = IBuffer::UniquePtr
using ITensorPtr = ITensor::UniquePtr
using CudaStreamPtr = std::shared_ptr<CudaStream>
using CudaMemPoolPtr = std::shared_ptr<CudaMemPool>

Public Functions

explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)

Construct a BufferManager.

Parameters:

stream – [in] The cuda stream to use for all operations on the GPU (allocation, de-allocation, copying, etc.).

inline ~BufferManager()

Destructor.

IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.

ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.

IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size and memory type.

ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions and memory type.

inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty IBuffer of the given memory type. It may be resized later.

inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty ITensor of the given memory type. It may be reshaped later.

void setMem(IBuffer &buffer, int32_t value) const

Set the contents of the given buffer to value.

void setZero(IBuffer &buffer) const

Set the contents of the given buffer to zero.

void copy(void const *src, IBuffer &dst, MemoryType srcType) const

Copy src to dst.

void copy(IBuffer const &src, void *dst, MemoryType dstType) const

Copy src to dst.

inline void copy(void const *src, IBuffer &dst) const

Copy src to dst.

inline void copy(IBuffer const &src, void *dst) const

Copy src to dst.

void copy(IBuffer const &src, IBuffer &dst) const

Copy src to dst.

IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

CudaStream const &getStream() const

Get the underlying cuda stream.

std::size_t memoryPoolReserved() const

The current size of the memory reserved by the memory pool.

std::size_t memoryPoolUsed() const

The current size of the memory used by the memory pool.

std::size_t memoryPoolFree() const

The current size of the memory free in the memory pool.

void memoryPoolTrimTo(std::size_t size)

Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
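A minimal usage sketch of the member functions above (a hedged example, not part of the API reference): it assumes a valid CUDA context, that ITensor::makeShape is available from iTensor.h, and that all sizes and values are illustrative.

#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/cudaStream.h>
#include <tensorrt_llm/runtime/iTensor.h>

#include <cstdint>
#include <memory>
#include <vector>

using namespace tensorrt_llm::runtime;

void bufferManagerSketch()
{
    auto stream = std::make_shared<CudaStream>();   // non-blocking stream on the current device
    BufferManager manager{stream};

    // Asynchronous device allocation of a [4, 256] float tensor, initialized to zero.
    auto deviceTensor = manager.gpu(ITensor::makeShape({4, 256}), nvinfer1::DataType::kFLOAT);
    manager.setZero(*deviceTensor);

    // Copy a host vector into a new device buffer, then bring it back to the host.
    std::vector<std::int32_t> hostData(256, 1);
    auto deviceCopy = manager.copyFrom(hostData, MemoryType::kGPU);
    auto hostCopy = manager.copyFrom(*deviceCopy, MemoryType::kCPU);

    // All operations above were enqueued on the stream; wait for them to finish.
    manager.getStream().synchronize();
}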

Public Static Functions

static IBufferPtr gpuSync(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size on the GPU, using cudaMalloc.

static ITensorPtr gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.

static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size on the CPU.

static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions on the CPU.

static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned IBuffer of the given size on the CPU.

static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned ITensor of the given dimensions on the CPU.

static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.

static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.

static IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size in UVM.

static ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions in UVM.
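The static helpers above allocate without a BufferManager instance. A small sketch (sizes are arbitrary; ITensor::makeShape is assumed from iTensor.h):

// Host, pinned, UVM and synchronous device allocations.
auto hostIds    = BufferManager::cpu(ITensor::makeShape({8, 128}), nvinfer1::DataType::kINT32);
auto pinnedBuf  = BufferManager::pinned(1024);                  // defaults to kBYTE_TYPE (kUINT8)
auto managedBuf = BufferManager::managed(2048, nvinfer1::DataType::kFLOAT);
auto deviceBuf  = BufferManager::gpuSync(4096);                 // cudaMalloc instead of cudaMallocAsync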

Public Static Attributes

static auto constexpr kBYTE_TYPE = nvinfer1::DataType::kUINT8

Private Members

CudaStreamPtr mStream
CudaMemPoolPtr mPool
bool const mTrimPool

Friends

friend class ::BufferManagerTest

common.h

Defines

FMT_DIM
namespace tensorrt_llm
namespace runtime

Typedefs

using SizeType32 = std::int32_t
using SizeType64 = std::int64_t
using TokenIdType = std::int32_t
using LoraTaskIdType = std::uint64_t
using TokenExtraIdType = std::uint64_t
using VecTokenExtraIds = std::vector<TokenExtraIdType>
using VecUniqueTokens = std::vector<UniqueToken>
template<typename T>
using StringPtrMap = std::unordered_map<std::string, std::shared_ptr<T>>

Enums

enum class RequestType : std::int32_t

Values:

enumerator kCONTEXT
enumerator kGENERATION
struct UniqueToken

Public Functions

inline bool operator==(UniqueToken const &other) const noexcept

Public Members

TokenIdType tokenId
TokenExtraIdType tokenExtraId
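A minimal sketch of comparing unique tokens; the assumption, based on the members above, is that operator== compares both tokenId and tokenExtraId:

UniqueToken a{/*tokenId=*/42, /*tokenExtraId=*/0};
UniqueToken b{/*tokenId=*/42, /*tokenExtraId=*/1};
bool const sameToken = (a == b);   // presumably false: the extra ids differ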

cudaEvent.h

namespace tensorrt_llm
namespace runtime
class CudaEvent

Public Types

using pointer = cudaEvent_t

Public Functions

inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)

Creates a new cuda event. The event will be destroyed in the destructor.

Parameters:

flags – Flags for event creation. By default, event timing is disabled.

inline explicit CudaEvent(pointer event, bool ownsEvent = true)

Pass an existing cuda event to this object.

Parameters:
  • event – The event to pass to this object.

  • ownsEvent – Whether this object owns the event and destroys it in the destructor.

inline pointer get() const

Returns the event associated with this object.

inline void synchronize() const

Synchronizes the event.
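A short sketch combining CudaEvent with a CudaStream (documented in cudaStream.h below); the work enqueued on the stream is left out:

CudaStream stream;     // new non-blocking stream on the current device
CudaEvent event;       // timing disabled by default

// ... enqueue asynchronous work on stream.get() ...

stream.record(event);  // mark the point after the enqueued work
event.synchronize();   // block the host until that point has been reached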

Private Types

using element_type = std::remove_pointer_t<pointer>
using EventPtr = std::unique_ptr<element_type, Deleter>

Private Members

EventPtr mEvent
class Deleter

Public Functions

inline explicit Deleter(bool ownsEvent)
inline explicit Deleter()
inline constexpr void operator()(pointer event) const

Private Members

bool mOwnsEvent

cudaStream.h

namespace tensorrt_llm
namespace runtime
class CudaStream

Public Functions

inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)

Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.

Parameters:
  • flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.

  • priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.

inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)

Pass an existing cuda stream to this object.

Parameters:
  • stream – The stream to pass to this object.

  • device – The device on which the stream was created.

  • ownsStream – Whether this object owns the stream and destroys it in the destructor.

inline explicit CudaStream(cudaStream_t stream)

Construct with an existing cuda stream or the default stream by passing nullptr.

inline int getDevice() const

Returns the device on which the stream was created.

inline cudaStream_t get() const

Returns the stream associated with this object.

inline void synchronize() const

Synchronizes the stream.

inline void record(CudaEvent::pointer event) const

Record an event on the stream.

inline void record(CudaEvent const &event) const

Record an event on the stream.

inline void wait(CudaEvent::pointer event) const

Wait for an event.

inline void wait(CudaEvent const &event) const

Wait for an event.
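A short sketch of cross-stream synchronization using the record/wait pair above; the enqueued work is left out:

CudaStream producer;
CudaStream consumer;
CudaEvent ready;

// ... enqueue work on producer.get() ...
producer.record(ready);    // record the point the consumer depends on
consumer.wait(ready);      // the consumer stream waits on the GPU; the host is not blocked
// ... enqueue dependent work on consumer.get() ...
consumer.synchronize();    // finally block the host until the consumer stream drains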

Private Types

using StreamPtr = std::unique_ptr<std::remove_pointer_t<cudaStream_t>, Deleter>

Private Members

StreamPtr mStream
int mDevice = {-1}

Friends

friend class CudaStreamBindings
class Deleter

Public Functions

inline explicit Deleter(bool ownsStream)
inline explicit Deleter()
inline constexpr void operator()(cudaStream_t stream) const

Private Members

bool mOwnsStream

decodingInput.h

namespace tensorrt_llm
namespace runtime
class DecodingInput
#include <decodingInput.h>

Represents the inputs to the decoder.

This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.

Public Types

using TensorConstPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr

Public Functions

inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize, TensorConstPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)
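A hedged sketch of constructing the mandatory inputs with a BufferManager; the shapes follow the member documentation below, all sizes are illustrative, and ITensor::makeShape is assumed from iTensor.h.

auto stream = std::make_shared<CudaStream>();
BufferManager manager{stream};

SizeType32 const batchSize = 4, beamWidth = 1, vocabSizePadded = 32000;
DecodingInput::TensorConstPtr logits = manager.gpu(
    ITensor::makeShape({batchSize, beamWidth, vocabSizePadded}), nvinfer1::DataType::kFLOAT);
DecodingInput::TensorPtr endIds = manager.gpu(
    ITensor::makeShape({batchSize * beamWidth}), nvinfer1::DataType::kINT32);
DecodingInput::TensorConstPtr batchSlots = BufferManager::pinned(
    ITensor::makeShape({batchSize}), nvinfer1::DataType::kINT32);

DecodingInput input{/*maxLength=*/128, /*maxAttentionWindow=*/128, /*sinkTokenLength=*/0,
                    batchSize, std::move(logits), std::move(endIds), std::move(batchSlots)};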

Public Members

SizeType32 step

The index of the decoding step we are on. Only used in Python runtime.

SizeType32 maxLength

The maximum number of tokens to decode.

SizeType32 maxAttentionWindow

The maximum length of the attention window to consider while decoding.

SizeType32 sinkTokenLength

The number of tokens to use as attention sinks, as described in https://arxiv.org/html/2309.17453v3.

SizeType32 batchSize

The number of samples in the batch.

SizeType32 maxStopWordsLen

The maximum value in the stopWordsLens tensor.

SizeType32 maxBadWordsLen

The maximum value in the badWordsLens tensor.

TensorConstPtr logits

[batchSize, beamWidth, vocabSizePadded], on gpu. Logits are a probability distribution over the vocabulary, the output of the model.

TensorConstPtr endIds

[batchSize * beamWidth], on gpu

TensorConstPtr batchSlots

[batchSize], address map of the linear batch id to the seq slots, int32_t, pinned

TensorConstPtr finishReasons

[batchSize, beamWidth], finished states at current iteration. If true for some request, the decoding step of it is skipped, on gpu

TensorConstPtr sequenceLimitLength

[batchSize], on gpu. The maximum sequence length for each sequence in the batch.

TensorConstPtr embeddingBias

[batchSize, vocabSizePadded], on gpu

TensorConstPtr lengths

[batchSize, beamWidth], on gpu

std::vector<TensorPtr> badWordsLists
TensorConstPtr badWordsPtrs

[batchSize][2, badWordsLength], on gpu

TensorConstPtr badWordsLens

[batchSize], on gpu

std::vector<TensorPtr> stopWordsLists
TensorConstPtr stopWordsPtrs

[batchSize][2, stopWordsLength], pinned

TensorConstPtr stopWordsLens

[batchSize], pinned

TensorConstPtr noRepeatNgramSize

[batchSize], on gpu

TensorPtr cacheIndirection

[batchSize, beamWidth, maxSeqLen] - the k/v cache index for beam search, on gpu

std::optional<MedusaInputs> medusaInputs
std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs
std::optional<LookaheadInputs> lookaheadInputs
std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs
std::optional<EagleInputs> eagleInputs
struct EagleInputs

Public Functions

inline EagleInputs(TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths, TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths, TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds, TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)

Public Members

TensorConstPtr nextDraftTokens

[batchSize, maxDecodingDraftTokens]

TensorConstPtr nextDraftLens

[batchSize]

TensorConstPtr nextDraftPaths

[batchSize, maxDecodingTokens, maxPathLen]

TensorConstPtr lastDraftTokens

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr lastDraftLens

[batchSize]

TensorConstPtr lastDraftPaths

[batchSize, maxDecodingTokens, maxPathLen]

TensorConstPtr acceptedTokens

[batchSize, maxPathLen]

TensorConstPtr acceptedLens

[batchSize]

TensorConstPtr acceptedPathIds

[batchSize]

TensorConstPtr chunkedContextNextTokens

[batchSize]

TensorConstPtr seqSlots

[batchSize]

class ExplicitDraftTokensInputs

Public Members

TensorConstPtr nextDraftTokens

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr nextFlatTokens

[batchSize * maxDecodingTokens]

TensorConstPtr nextDraftIndices

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr nextDraftProbs

[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]

TensorConstPtr lastDraftTokens

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr lastDraftIndices

[batchSize, maxNumPaths, maxPathLen]

TensorConstPtr masks

[batchSize, maxDecodingTokens, maxDecodingTokens], bool

TensorConstPtr packedPositionIds

[batchSize * maxDecodingTokens]

TensorConstPtr bestPathLengths

[batchSize]

TensorConstPtr bestPathIndices

[batchSize]

TensorConstPtr nextGenerationLengths

[batchSize]

TensorConstPtr lastPositionIdsBase

[batchSize]

TensorConstPtr lastGenerationLengths

[batchSize]

TensorConstPtr maxGenLengthDevice

[1]

TensorConstPtr seqSlots

[batchSize]

class ExternalDraftTokensInputs

Public Members

TensorPtr draftLogits
TensorPtr draftProbs
TensorPtr targetProbs
TensorPtr numDraftTokens
TensorPtr draftTokenIds
TensorPtr useDraftLogits
TensorPtr useDraftLogitsHost
SizeType32 step
float constantThreshold
bool useRandomAcceptanceThreshold
struct LookaheadInputs

Public Members

TensorPtr tokensPerStep
class MedusaInputs

Public Members

TensorConstPtr medusaPaths

[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu

TensorConstPtr medusaTreeIds

[batchSize, maxTokensPerStep], on gpu

std::vector<std::vector<TensorPtr>> medusaLogits

[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu

TensorPtr medusaCurTokensPerStep

[batchSize], on gpu

TensorConstPtr medusaTargetTokensPerStep

[batchSize], on gpu

decodingOutput.h

namespace tensorrt_llm
namespace batch_manager
namespace runtime
class DecodingOutput

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit DecodingOutput(TensorPtr ids, TensorPtr gatheredIds)

Public Members

TensorPtr ids
TensorPtr gatheredIds
TensorPtr newTokensSteps
TensorPtr newTokens
std::vector<TensorPtr> newTokensVec
TensorPtr finishReasons
TensorPtr finishedSum
TensorPtr logProbs
TensorPtr cumLogProbs
TensorPtr parentIds
TensorPtr lengths
TensorPtr cacheIndirection
TensorPtr logProbsTiled
BeamHypotheses beamHypotheses
std::optional<SpeculativeDecodingOutputs> speculativeDecodingOutputs
std::optional<ExplicitDraftTokensBuffers::Inputs> explicitDraftTokensBuffers
std::optional<LookaheadDecodingBuffers> lookaheadOutputs
std::optional<EagleBuffers::Inputs> eagleBuffers

Public Static Attributes

static float constexpr kNegativeInfinity = -1e20f
class BeamHypotheses

Public Functions

void empty(BufferManager &manager)
void reshape(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)
void release()
void init(BufferManager &manager, TokenIdType endId)
BeamHypotheses slice(SizeType32 batchIndex, SizeType32 size) const

Public Members

TensorPtr outputIdsCBA
TensorPtr logProbsCBA
TensorPtr sequenceLengthsCBA
TensorPtr cumLogProbsCBA
TensorPtr normedScoresCBA
TensorPtr numBeamsCBA
TensorPtr minNormedScoresCBA
TensorPtr batchDones
class SpeculativeDecodingOutputs

Public Members

TensorPtr nextDraftTokens
TensorPtr nextDraftTokensLen
TensorPtr prevDraftTokensLen
TensorPtr acceptedTokensLen
TensorPtr acceptedLengthsCumSum
TensorPtr pathsOffsets

eagleBuffers.h

namespace tensorrt_llm
namespace batch_manager
namespace runtime
class EagleBuffers

Public Types

using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>
using RequestVector = std::vector<LlmRequestPtr>
using SizeType32 = runtime::SizeType32
using ITensor = runtime::ITensor
using BufferPtr = runtime::IBuffer::SharedPtr
using TensorPtr = runtime::ITensor::SharedPtr
using TensorMap = runtime::StringPtrMap<runtime::ITensor>

Public Functions

EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, runtime::ITensor const &requestTypes, ITensor const &seqSlots, EagleBuffers::Inputs const &decoderBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const

Public Members

Inputs engineInputs
class tensorrt_llm::runtime::EagleBuffers::EngineOutputs engineOutputs

Private Functions

template<typename T>
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, SizeType32 vocabSizePadded, ITensor const &seqSlots, EagleBuffers::Inputs const &draftBuffers, runtime::EagleModule const &eagleModule, runtime::BufferManager const &manager) const

Private Members

std::size_t scanTempStorageBytes = {0}
std::size_t reduceTempStorageBytes = {0}
float mDefaultPosteriorThreshold = {0.09f}
bool mDoGreedySampling = {true}
BufferPtr scanReduceTempStorage
TensorPtr cumSumGenerationLengths
TensorPtr maxGenerationLength
TensorPtr chunkedContextNextTokensHost
TensorPtr greedySamplingHost
TensorPtr posteriorAlphaHost
TensorPtr posteriorThresholdHost
class EngineOutputs

Public Members

TensorPtr nextDraftTokens

[batchSize, maxDecodingDraftTokens]

TensorPtr nextDraftLens

[batchSize]

TensorPtr nextDraftPaths

[batchSize, maxNumPaths, maxPathLen]

TensorPtr acceptedTokens

[batchSize, maxPathLen]

TensorPtr acceptedLens

[batchSize]

TensorPtr acceptedPaths

[batchSize]

TensorPtr chunkedContextNextTokens

[batchSize]

class Inputs

Public Functions

void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)

Public Members

TensorPtr temperatures

[maxBatchSize] or [numSequences]

TensorPtr posteriorAlpha

[maxBatchSize] or [numSequences]

TensorPtr posteriorThreshold

[maxBatchSize] or [numSequences]

TensorPtr randomDataSample

[maxBatchSize] or [numSequences]

TensorPtr randomDataValidation

[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]

TensorPtr draftTokens

[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]

TensorPtr draftLens

[maxBatchSize] or [numSequences]

TensorPtr draftPaths

[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]

TensorPtr specDecodingGenerationLengths

[maxBatchSize] or [numGenSequences]

TensorPtr specDecodingGenerationLengthsHost

[maxBatchSize] or [numGenSequences]

TensorPtr specDecodingPackedMasks

[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]

TensorPtr specDecodingPositionOffsets

[maxBatchSize] or [numGenSequences]

TensorPtr eagleNetCtxRequestTypesHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetCtxContextLengthsHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetCtxPastKeyValueLengthsHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetGenRequestTypesHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetGenContextLengthsHost

[maxBatchSize] or [numSequences]

TensorPtr eagleNetGenPastKeyValueLengthsHost

[maxBatchSize] or [numSequences]

TensorPtr inputGenTokensHost

[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]

TensorPtr chunkedContextNextTokens

[maxBatchSize] or [numSequences]

TensorPtr useDynamicTreeHost

[1]

explicitDraftTokensBuffers.h

namespace tensorrt_llm
namespace runtime
class ExplicitDraftTokensBuffers

Public Types

using SizeType32 = runtime::SizeType32
using ITensor = runtime::ITensor
using BufferPtr = runtime::IBuffer::SharedPtr
using TensorPtr = runtime::ITensor::SharedPtr
using TensorMap = runtime::StringPtrMap<runtime::ITensor>

Public Functions

ExplicitDraftTokensBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const &requestTypes, ITensor const &seqSlots, ExplicitDraftTokensBuffers::Inputs const &decoderBuffers, ITensor const &contextPositionIds, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const

Public Members

tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs engineInputs
class tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs engineOutputs
std::size_t scanTempStorageBytes = {0}
BufferPtr scanTempStorage
TensorPtr cumSumGenerationLengths

Private Functions

template<typename T>
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded, ITensor const &seqSlots, ExplicitDraftTokensBuffers::Inputs const &draftBuffers, ITensor const &contextPositionIds, runtime::ExplicitDraftTokensModule const &explicitDraftTokensModule, runtime::CudaStream const &stream) const
class EngineInputs : public tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs

Public Members

TensorPtr requestTypesDevice

[numSequences], on gpu

TensorPtr positionOffsets

[numGenSequences]

class EngineOutputs

Public Members

TensorPtr nextGenerationLengths

[batchSize]

TensorPtr nextPositionOffsets

[batchSize]

TensorPtr masks

[batchSize, maxDecodingTokens, maxDecodingTokens], bool

TensorPtr nextDraftTokens

[batchSize, maxNumPaths, maxPathLen]

TensorPtr nextDraftIndices

[batchSize, maxNumPaths, maxPathLen]

TensorPtr nextDraftProbs

[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]

TensorPtr nextFlatTokens

[batchSize * maxDecodingTokens]

TensorPtr bestPathLengths

[batchSize]

TensorPtr bestPathIndices

[batchSize]

TensorPtr maxGenToken

[1]

TensorPtr totalGenToken

[1]

TensorPtr packedPositionIds

[batchSize * maxDecodingTokens]

class Inputs

Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs

Public Functions

void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)

Public Members

TensorPtr temperatures

[maxBatchSize]

TensorPtr positionIdsBase

[maxBatchSize]

TensorPtr generationLengths

[maxBatchSize] or [numGenSequences]

TensorPtr randomDataSample

[maxBatchSize]

TensorPtr randomDataValidation

[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]

TensorPtr draftTokens

[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]

TensorPtr draftIndices

[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]

TensorPtr draftProbs

[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]

TensorPtr packedMasks

[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]

TensorPtr positionIds

[maxBatchSize] or [numGenSequences]

TensorPtr maxGenLengthHost
TensorPtr generationLengthsHost

generationInput.h

namespace tensorrt_llm
namespace runtime
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>

Public Types

using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)
template<typename TTensor, typename PromptTuningParams>
class GenericGenerationInput
#include <generationInput.h>

  • endId, is the token ID that marks the end of the input sequence (aka EOS or end-of-sequence). It is 50,256 for the GPT2 model, which has a vocabulary of 50,257 tokens, for example,

  • padId, is the token ID that is used for padding (i.e. it fills the slots at an index greater than or equal to the input length for padded sequences). It can be set to the same value as endId,

  • ids, is the tensor of input IDs. That tensor must be allocated on the GPU. When the input tensor is padded, the shape of ids is [batchSize, maxInputLength], where batchSize and maxInputLength must respect the maximum sizes in sessionConfig passed to the GptSession constructor. When the input is packed, the shape of ids is [numTokens], where numTokens is the sum of the lengths of the different sequences in the batch,

  • lengths, is the tensor of input sequence lengths. That tensor must be allocated on the GPU and contain batchSize values,

  • packed, indicates if the ids tensor is packed or padded. In this release, that flag must match the value passed to the constructor through the instance of the ModelConfig class. In a future release, the session may be made more flexible and automatically pad or pack the input,

  • embeddingBiasOpt, is a tensor of floating-point values on the GPU that contains the bias to add to the logits during sampling (after the projection from hidden states to logits as the last step of the model). This tensor must have vocabSize elements (as defined in the modelConfig argument passed to the constructor),

  • badWordsList, is a tensor of integers on the GPU that encodes the list of words that have to be banned from generated sequences. Its shape is [2, badWordsLength], as explained below, or [batchSize, 2, badWordsLength] when there is a different list for each sequence in the batch,

  • stopWordsList, is a tensor of integers on the GPU that encodes the list of words that trigger the end of the generation for a sequence. Its shape is [2, stopWordsLength], as explained below, or [batchSize, 2, stopWordsLength] when there is a different list for each sequence in the batch,

  • maxNewTokens, is the maximum number of tokens to generate.

The badWordsList and stopWordsList tensors have the same shape [2, length]. Let’s consider an example with three words to describe the representation of those lists. The first word contains tokens [5, 7, 3], the second one contains [9, 2] and the third one is composed of tokens [6, 2, 4, 1]. In total, there are 9 tokens. That’s the length. The shape of the tensor is [2, 9]. The first row of the tensor must contain the 9 token IDs and the second row must store the inclusive prefix-sum of the word lengths as shown on the following diagram:

   0           3       5              9
   |           |       |              |
   V           V       V              V
[  5,  7,  3,  9,  2,  6,  2,  4,  1]
[  3,  5,  9, -1, -1, -1, -1, -1, -1]

In case all the words are made of a single token, the inner-most dimension of the tensor must be increased by 1 (i.e. the length for 4 words, each made of a single token, must be 5 instead of 4, so the shape is [2, 5]).
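As a hedged sketch, the three-word example above could be encoded and uploaded with the BufferManager documented earlier (ITensor::makeShape is assumed from iTensor.h; any GPU-resident [2, 9] int32 tensor with this layout would do):

// Row 0: the 9 concatenated token ids; row 1: inclusive prefix-sum of the word lengths.
std::vector<std::int32_t> stopWords = {
    5, 7, 3, 9, 2, 6, 2, 4, 1,
    3, 5, 9, -1, -1, -1, -1, -1, -1};

auto stream = std::make_shared<CudaStream>();
BufferManager manager{stream};

// Copy into a [2, 9] tensor on the GPU and assign it to GenerationInput::stopWordsList.
auto stopWordsList = manager.copyFrom(stopWords, ITensor::makeShape({2, 9}), MemoryType::kGPU);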

Public Types

using TensorPtr = TTensor

Public Functions

inline explicit GenericGenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)

Public Members

SizeType32 endId
SizeType32 padId
TensorPtr ids
TensorPtr lengths
bool packed
TensorPtr embeddingBias
TensorPtr badWordsList
TensorPtr stopWordsList
std::optional<SizeType32> maxNewTokens
PromptTuningParams promptTuningParams

generationOutput.h

namespace tensorrt_llm
namespace runtime
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>

Public Types

using Base = GenericGenerationOutput<ITensor::SharedPtr>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationOutput(TensorPtr ids, TensorPtr lengths)
template<typename TTensor>
class GenericGenerationOutput
#include <generationOutput.h>

  • ids, is a tensor that contains the output token IDs. Its shape is [batchSize, beamWidth, maxSeqLength] where maxSeqLength is the sum of maxInputLength and maxNewTokens. After generation, it contains, for each sequence, a copy of the input tokens followed by the output tokens. When a sequence is shorter than maxSeqLength, padding tokens are added at the end of the sequence.

Note that the shape of that tensor differs in this version of TensorRT-LLM from its shape in previous versions.

  • logProbs, is a tensor of floating-point values on the GPU to store the log-prob of the generated tokens. Its shape is [maxNewTokens, batchSize, beamWidth]. Its shape will likely change in a future release to match the shape of the output ids tensor.

  • contextLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the context. Its shape is [batchSize, maxSequenceLength, vocabSizePadded]. If remove_input_padding is enabled, its shape is [packedSize, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_context_logits or gather_all_token_logits parameter enabled.

    After inference is complete, you can read the context logits from GenerationOutput.contextLogits; they reside on the GPU. For details on how to retrieve them, refer to the gptSessionBenchmark.cpp example.

    It is important to point out that enabling the computation may have an impact on performance (the language modeling head (LM head) has to perform a matrix multiplication on all the context tokens instead of just the last one).

  • generationLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the generation. Its shape is [batchSize, beamWidth, maxOutputLen, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_generation_logits or gather_all_token_logits parameter enabled.

    Generation logits can also be obtained through GenerationOutput.generationLogits after inference is completed.

  • onTokenGenerated, is a callback function invoked in the generation loop to pass newly generated tokens to the caller while the loop continues to execute. An implementation of that callback must accept the output ids tensor, the generation step and a boolean flag that indicates if the generation is complete.
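A hedged sketch of attaching the callback; outputIds and outputLengths stand for previously allocated GPU tensors, and the lambda body is illustrative:

GenerationOutput outputs{/*ids=*/outputIds, /*lengths=*/outputLengths};

outputs.onTokenGenerated =
    [](GenerationOutput::TensorPtr const& ids, SizeType32 step, bool finished)
{
    // Inspect or stream the tokens produced so far; `ids` lives on the GPU.
    if (finished)
    {
        // The generation loop has completed for the whole batch.
    }
};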

Public Types

using TensorPtr = TTensor
using Callback = std::function<void(TensorPtr const &ids, SizeType32 step, bool finished)>

Public Functions

inline explicit GenericGenerationOutput(TensorPtr ids, TensorPtr lengths)

Public Members

TensorPtr ids
TensorPtr lengths
TensorPtr cumLogProbs
TensorPtr logProbs
TensorPtr contextLogits
TensorPtr generationLogits
Callback onTokenGenerated

gptDecoder.h

namespace tensorrt_llm
namespace layers
namespace runtime

Functions

inline runtime::ITensor::SharedConstPtr getDefaultBatchSlots(runtime::SizeType32 batchSize)

Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.

template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder

Public Types

using CudaStreamPtr = BufferManager::CudaStreamPtr
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

GptDecoder(executor::DecodingMode const &mode, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, CudaStreamPtr const &stream, std::shared_ptr<SpeculativeDecodingModule const> speculativeDecodingModule = nullptr)
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, TensorConstPtr const &batchSlots, std::optional<DecodingOutput> const &output = std::nullopt, std::optional<std::vector<decoder_batch::Request> const> const &requests = std::nullopt) override
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
virtual void forwardSync(DecodingOutput &output, DecodingInput const &input) override
inline virtual SamplingConfig const &getSamplingConfig() override

Private Members

std::shared_ptr<BufferManager> mManager
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
std::shared_ptr<tensorrt_llm::runtime::DecodingLayerWorkspace> mDecodingLayerWorkspace
SamplingConfig mSamplingConfig
size_t mMaxBatchSize
executor::DecodingMode mDecodingMode
class IGptDecoder

Subclassed by tensorrt_llm::runtime::GptDecoder< T >

Public Types

using TensorPtr = runtime::ITensor::SharedPtr
using TensorConstPtr = runtime::ITensor::SharedConstPtr

Public Functions

virtual ~IGptDecoder() = default
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, TensorConstPtr const &batchSlots, std::optional<DecodingOutput> const &output = std::nullopt, std::optional<std::vector<decoder_batch::Request> const> const &requests = std::nullopt) = 0
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
virtual void forwardSync(DecodingOutput &output, DecodingInput const &input) = 0
virtual SamplingConfig const &getSamplingConfig() = 0

Public Static Functions

static inline std::unique_ptr<IGptDecoder> create(executor::DecodingMode const &mode, nvinfer1::DataType dtype, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, BufferManager::CudaStreamPtr const &stream, std::shared_ptr<SpeculativeDecodingModule const> const &speculativeDecodingModule = nullptr)

gptDecoderBatched.h

namespace tensorrt_llm
namespace runtime
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched
#include <gptDecoderBatched.h>

GPT decoder class with support for in-flight batching.

Public Types

enum class ForwardType

Values:

enumerator kASYNC
enumerator kSYNC
using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = ITensor::SharedPtr
using SharedConstPtr = ITensor::SharedConstPtr

Public Functions

GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const &speculativeDecodingMode, nvinfer1::DataType dtype)
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) override

Set up the decoder before calling forward().

virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override

Set up buffers for ExplicitDraftTokens decoding.

virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) override

Set up buffers for Eagle decoding.

virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override

Set up buffers for Lookahead decoding.

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) override

Initialize the decoder with a new batch of inputs.

virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs, ModelConfig const &modelConfig) override

Initialize the batched decoder at seqSlots with new requests.

virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent) override

Wait for the call to forwardAsync associated with a token to complete.

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent, decoder_batch::Output &output, decoder_batch::Input const &input) override

Call decoder forwardSync and wait for the call to forwardAsync associated with a token to complete.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override

Run one step for all requests without blocking the host thread.

virtual void forwardSync() override

Wait for the last call to forwardAsync to complete.

inline virtual std::vector<bool> getFinished() const override
Returns:

[batchSize], indicators of finished requests

inline virtual TensorPtr getFinishReasons() const override
Returns:

[batchSize, beamWidth], FinishedState value, on gpu

inline virtual TensorPtr getIds(SizeType32 batchIdx) const override
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.

inline virtual TensorPtr getIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.

inline virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const override
Parameters:

batchIdx – index of the batch

Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.

inline virtual TensorPtr getGatheredIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu

virtual CudaEvent finalize(SizeType32 batchSlot, SamplingConfig const &samplingConfig, bool streaming) const override

Gather final beam search results for request batchSlot. The result will only be available after the returned event has been synchronized.

virtual void finalize(SamplingConfig const &samplingConfig) const override

Gather final beam search results for all requests.

inline virtual TensorPtr getParentIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu

inline virtual TensorPtr getCumLogProbs() const override
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const override
Returns:

[maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs() const override
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs(SizeType32 batchIdx) const override
Returns:

[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getAllNewTokens() const override

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

inline virtual TensorPtr getNewTokens(SizeType32 iter = 0) const override

Get tokens generated in one step of last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

inline virtual std::vector<SizeType32> getNbSteps() const override
Returns:

[batchSize], the number of generation steps executed on each request

inline virtual TensorPtr getNbFinished() const override
Returns:

[1], number of finished sequences, in pinned host memory

inline virtual TensorPtr getNextDraftTokens() const override
Returns:

[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu

inline virtual TensorPtr getPrevDraftTokensLengths() const override
Returns:

[batchSize], predicted draft tokens lengths for previous step, on gpu

inline virtual TensorPtr getNextDraftTokensLengths() const override
Returns:

[batchSize], predicted draft tokens lengths for next step, on gpu

inline virtual TensorPtr getAcceptedLengthsCumSum() const override
Returns:

[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu

inline virtual TensorPtr getAcceptedPackedPaths() const override
Returns:

[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu

inline virtual executor::DecodingMode getDecodingMode() const override

Private Types

using GptDecoderPtr = std::unique_ptr<IGptDecoder>
using DecodingInputPtr = std::unique_ptr<DecodingInput>
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>

Private Functions

CudaEvent postProcessRequest(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const

Gather final beam search results for request batchIdx.

void newRequest(SizeType32 batchSlot, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)

Initialize the decoder at batchSlot with a new request.

void allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype)

Allocate buffers for speculative decoding.

void setupSpeculativeDecoding(ModelConfig const &modelConfig)

Set up buffers for speculative decoding.

void setupLookahead(ModelConfig const &modelConfig)

Set up buffers for lookahead decoding.

void newRequestSpeculativeDecoding(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)

Sets up decoder internal tensors for a new speculative decoding request.

void newRequestDraftTokensExternal(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)

Sets up decoder internal tensors for a new request in Draft model Sps mode.

void newRequestMedusa(SizeType32 batchIdx, decoder_batch::Request const &request)

Sets up decoder internal tensors for a new Medusa request.

void newRequestLookahead(SizeType32 batchIdx, decoder_batch::Request const &request)

Sets up decoder internal tensors for a new Lookahead request.

void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const &request)

Sets up decoder internal tensors for a new Explicit draft tokens request.

void newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const &request, ModelConfig const &modelConfig)

Sets up decoder internal tensors for a new Eagle request.

void updateFinished(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent)

Updates finished state on host for all active requests.

void setExplicitDraftTokensInputs(decoder_batch::Input const &input)

Sets inputs for explicit draft tokens.

void setEagleInputs(decoder_batch::Input const &input)

Sets inputs for eagle decoding.

void forwardDispatch(decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)

Calls decoders for tokens per engine step.

void forwardDecoder(SizeType32 step, decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)

Calls decoder for whole batch.

Private Members

std::size_t const mVocabSize
std::size_t const mVocabSizePadded
CudaStreamPtr mRuntimeStream
CudaStreamPtr mDecoderStream
BufferManager mBufferManager
DecoderFinishedEventPtr mDecoderFinishEvent
CudaEvent mForwardEvent
GptDecoderPtr mDecoder
DecodingInputPtr mJointDecodingInput
DecodingOutputPtr mJointDecodingOutput
std::vector<SizeType32> mNbSteps
std::vector<bool> mFinished
TensorPtr mFinishedSum
std::vector<SizeType32> mMaxNewTokens
std::vector<SizeType32> mBeamWidths
std::vector<SizeType32> mNumDecodingEngineTokens
TensorPtr mFinishedSteps
TensorPtr mBatchSlotsSetup
TensorPtr mBatchSlotsDecoder
SizeType32 mMaxSequenceLength = {}
SizeType32 mMaxAttentionWindow = {}
SizeType32 mSinkTokenLength = {}
SizeType32 mActualBatchSize = {}
SizeType32 mMaxDecodingDecoderTokens = {}
SizeType32 mMaxDecodingEngineTokens = {}
SpeculativeDecodingMode mSpeculativeDecodingMode
executor::DecodingMode mDecodingMode = {executor::DecodingMode::Auto()}
std::shared_ptr<DecodingOutput::BeamHypotheses> mOutputBeamHypotheses = {nullptr}
DecodingOutput::TensorPtr mCumLogProbsTmp
SizeType32 mNumSMs

gptJsonConfig.h

namespace tensorrt_llm
namespace runtime
class GptJsonConfig

Public Functions

inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism, SizeType32 gpusPerNode, ModelConfig modelConfig, std::optional<RuntimeDefaults> runtimeDefaults = std::nullopt)
inline ModelConfig const &getModelConfig() const
inline ModelConfig &getModelConfigMutable()
inline std::string const &getName() const
inline std::string const &getVersion() const
inline std::string const &getPrecision() const
inline SizeType32 constexpr getTensorParallelism() const
inline SizeType32 constexpr getPipelineParallelism() const
inline SizeType32 constexpr getContextParallelism() const
inline SizeType32 constexpr getGpusPerNode() const
inline SizeType32 constexpr getWorldSize() const
inline std::optional<RuntimeDefaults> getRuntimeDefaults() const
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
inline std::string engineFilename(WorldConfig const &worldConfig) const

Public Static Functions

static GptJsonConfig parse(std::string const &json)
static GptJsonConfig parse(std::istream &json)
static GptJsonConfig parse(std::filesystem::path const &path)
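A hedged sketch of loading an engine's config.json; the path is illustrative, and constructing the matching WorldConfig is assumed to happen elsewhere:

auto const jsonConfig = GptJsonConfig::parse(std::filesystem::path{"engine_dir/config.json"});

auto const& modelConfig = jsonConfig.getModelConfig();
auto const worldSize = jsonConfig.getWorldSize();   // total number of ranks implied by the parallelism settings

// Given a WorldConfig describing the current rank, the per-rank engine file is:
//   auto const engineFile = jsonConfig.engineFilename(worldConfig);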

Private Members

std::string const mName
std::string const mVersion
std::string const mPrecision
SizeType32 const mTensorParallelism
SizeType32 const mPipelineParallelism
SizeType32 const mContextParallelism
SizeType32 const mGpusPerNode
ModelConfig mModelConfig
std::optional<RuntimeDefaults> mRuntimeDefaults

gptSession.h

namespace tensorrt_llm
namespace batch_manager
namespace kv_cache_manager
namespace runtime
class GptSession

Public Types

using LoggerPtr = std::shared_ptr<nvinfer1::ILogger>

Public Functions

GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, RawEngine const &rawEngine, LoggerPtr logger = nullptr)
Parameters:
  • sessionConfig – Configuration of the session,

  • modelConfig – Description of the model,

  • worldConfig – Description of the environment,

  • rawEngine – The compiled TensorRT engine,

  • logger – The optional logger.

inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
nvinfer1::ILogger &getLogger() const
BufferManager const &getBufferManager() const
BufferManager::CudaStreamPtr getRuntimeStreamPtr() const
inline ModelConfig const &getModelConfig() const
inline WorldConfig const &getWorldConfig() const
inline int getDevice() const noexcept
inline bool getNormalizeLogProbs() const noexcept
nvinfer1::IEngineInspector &getEngineInspector() const
nvinfer1::DataType getLogitDataType() const
void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig, std::shared_ptr<GenerationProfiler> const generationProfiler = nullptr)

This function performs the generation loop.

Given input tensors to read from and output tensors to populate, that member function will run the generation loop until it reaches the maximum number of tokens that can be produced, or until each sequence has reached completion (due to the production of “end-of-sequence” or a word in the list of “stop words”). The pseudo-code of that function looks like (member function names were changed to keep the presentation simple):

// Have all the sequences in the batch reached completion?
bool allFinished = false;

// Until all sequences are finished or the number of steps reaches the limit...
for (int step = 0; !allFinished && step < maxNewTokens; ++step) {

    // Trigger the computation of the logits...
    computeLogits(...);

    // Run the sampling to produce a token (for each active sequence) from the logits.
    allFinished = generateTokensFromLogits(...);

    // Callback to stream the output tokens while the generation loop continues.
    onTokenGenerated(...);
}
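A hedged end-to-end sketch of driving generate: modelConfig, worldConfig and rawEngine are assumed to have been obtained elsewhere (e.g. via GptJsonConfig::parse and the engine loader), SamplingConfig is assumed constructible from a beam width, and all ids, lengths and sizes are illustrative.

GptSession::Config sessionConfig{/*maxBatchSize=*/8, /*maxBeamWidth=*/1, /*maxSequenceLength=*/1024};
GptSession session{sessionConfig, modelConfig, worldConfig, rawEngine};

auto const& manager = session.getBufferManager();

// Padded input ids of shape [8, 128] and the per-sequence lengths, both on the GPU.
std::vector<std::int32_t> ids(8 * 128, 0);
std::vector<std::int32_t> lengths(8, 128);
GenerationInput inputs{/*endId=*/50256, /*padId=*/50256,
                       manager.copyFrom(ids, ITensor::makeShape({8, 128}), MemoryType::kGPU),
                       manager.copyFrom(lengths, ITensor::makeShape({8}), MemoryType::kGPU)};
inputs.maxNewTokens = 64;

GenerationOutput outputs{manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),
                         manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)};

SamplingConfig samplingConfig{/*beamWidth=*/1};
session.generate(outputs, inputs, samplingConfig);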
void setLayerProfiler()

Set LayerProfiler to collect performance per layer.

std::string getLayerProfileInfo() const

Print profile information per layer.

Private Types

using BaseKVCacheManager = batch_manager::kv_cache_manager::BaseKVCacheManager
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
using TensorPtr = runtime::ITensor::SharedPtr
using TokenGeneratedCallback = std::function<void(SizeType32 step, bool finished)>

Private Functions

inline bool useCudaGraphs()
void generateBatched(std::vector<GenerationOutput> &microBatchesOutputs, std::vector<GenerationInput> const &microBatchesInputs, SamplingConfig const &samplingConfig, TokenGeneratedCallback const &onTokenGenerated, std::shared_ptr<GenerationProfiler> const generationProfiler)
void setup(Config const &sessionConfig)
void createContexts()
void createBuffers(SizeType32 numMicroBatches)
void createDecoders(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType32 numMicroBatches, executor::DecodingMode const &decodingMode)
void createKvCacheManager(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, KvCacheConfig const &config)
void createCustomAllReduceWorkspace(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)
void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType32> const &generationBatchesOffsets, BaseKVCacheManager const *kvCacheManager)
SizeType32 executeGenerationStep(SizeType32 step, std::vector<GenerationInput> const &microBatchesInputs, std::vector<GenerationOutput> &microBatchesOutputs, std::vector<SizeType32> const &microBatchOffsets, BaseKVCacheManager *kvCacheManager, std::vector<bool> &microBatchesFinished)
void decoderStepAsync(SizeType32 decoderStep, SizeType32 microBatchId)

Execute decoder on last PP rank, receive decoder output on other PP ranks.

bool shouldStopSync(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 microBatchId)

Synchronize with the decoder and return the shouldStop flag.

void finalize(SizeType32 microBatchId, SamplingConfig const &samplingConfig)

Collect final output ids and log probs on last PP rank and send them to first PP rank.

Receives are asynchronous on host, so synchronization is required before access.

void kvCacheAddSequences(SizeType32 beamWidth, SizeType32 microBatchId, SizeType32 firstBatchIdx)
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType32 microBatchId) const

Populate outputIds and return reference to newTokens tensor.

TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)
bool shouldUseKVCacheManager() const

Private Members

ModelConfig const mModelConfig
WorldConfig const mWorldConfig
int mDevice = {-1}
std::shared_ptr<NcclCommunicator> mPipelineComm
std::shared_ptr<CudaStream> mCommStream
CudaEvent mCommEvent = {}
std::shared_ptr<AllReduceBuffers> mAllReduceBuffers
SizeType32 mDecoderMaxSequenceLength = {}
std::vector<SizeType32> mDecoderMaxAttentionWindowVec = {}
SizeType32 mDecoderMaxAttentionWindow = {}
SizeType32 mDecoderSinkTokenLength = {}
LoggerPtr mLogger
std::shared_ptr<TllmRuntime> mRuntime
std::shared_ptr<BaseKVCacheManager> mKvCacheManager
MicroBatchConfig mMicroBatchConfig
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
std::vector<CudaEvent> mReceivedEvents
bool mCudaGraphMode = {false}
std::vector<CudaGraphExecutor> mCudaGraphInstances
bool mNormalizeLogProbs = true

Friends

friend class batch_manager::TrtGptModelV1
class Config
#include <gptSession.h>

Configuration for session execution and buffer sizes. generate may be called with batch size and beam width smaller than the configured parameters.

maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.

Public Functions

inline Config(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, float gpuWeightsPercent = 1.0)

Public Members

SizeType32 maxBatchSize
SizeType32 maxBeamWidth
SizeType32 maxSequenceLength
float gpuWeightsPercent
bool decoderPerRequest = {false}
bool cudaGraphMode = {false}
KvCacheConfig kvCacheConfig = {}
std::optional<SizeType32> ctxMicroBatchSize = std::nullopt
std::optional<SizeType32> genMicroBatchSize = std::nullopt
std::optional<executor::DecodingMode> decodingMode = std::nullopt
bool normalizeLogProbs = true
class CudaGraphExecutor

Public Functions

CudaGraphExecutor() = default
inline ~CudaGraphExecutor()
inline bool hasInstance()
void clear()
void prepareNextGraph(TllmRuntime const &runtime, SizeType32 nextContextId)
void launch(CudaStream const &stream)

Private Functions

void create(cudaGraph_t const &graph)
bool update(cudaGraph_t const &graph)
void uploadToStream(CudaStream const &stream)

Private Members

cudaGraphExec_t mInstance
class GenerationProfiler
#include <gptSession.h>

Optional profiler class to profile the generation phase of an inference request.

Public Functions

inline GenerationProfiler()
inline CudaEvent const &getStart() const
inline CudaEvent const &getEnd() const
inline float getElapsedTimeMs()

Public Static Attributes

static constexpr unsigned int flags = {cudaEventDefault}

Private Members

CudaEvent start
CudaEvent end
class MicroBatchConfig

Public Functions

inline MicroBatchConfig()
explicit MicroBatchConfig(SizeType32 maxBatchSize, SizeType32 pipelineParallelism, std::optional<SizeType32> genMicroBatchSize, std::optional<SizeType32> ctxMicroBatchSize)
inline constexpr SizeType32 numCtxPerGen() const
inline constexpr SizeType32 getGenGraphId(SizeType32 flipFlopId, SizeType32 generationBatchId) const

Flip-flop between two graph instances for each generation batch.

Public Members

SizeType32 numCtxBatches
SizeType32 numGenBatches
SizeType32 ctxBatchSize
SizeType32 genBatchSize
namespace utils

Functions

std::vector<uint8_t> loadEngine(std::string const &enginePath)

iBuffer.h

template<>
struct MemoryTypeString<MemoryType::kGPU>

Public Static Attributes

static auto constexpr value = "GPU"
template<>
struct MemoryTypeString<MemoryType::kCPU>

Public Static Attributes

static auto constexpr value = "CPU"
template<>
struct MemoryTypeString<MemoryType::kPINNED>

Public Static Attributes

static auto constexpr value = "PINNED"
template<>
struct MemoryTypeString<MemoryType::kUVM>

Public Static Attributes

static auto constexpr value = "UVM"
template<>
struct MemoryTypeString<MemoryType::kPINNEDPOOL>

Public Static Attributes

static auto constexpr value = "PINNEDPOOL"
template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

Public Types

using type = float

Public Static Attributes

static char constexpr name[] = "float"
static auto constexpr size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>

Public Types

using type = half

Public Static Attributes

static char constexpr name[] = "half"
static auto constexpr size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>

Public Types

using type = std::int8_t

Public Static Attributes

static char constexpr name[] = "int8"
static auto constexpr size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>

Public Types

using type = std::int32_t

Public Static Attributes

static char constexpr name[] = "int32"
static auto constexpr size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>

Public Types

using type = std::int64_t

Public Static Attributes

static char constexpr name[] = "int64"
static auto constexpr size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

Public Types

using type = std::uint32_t

Public Static Attributes

static char constexpr name[] = "uint32"
static auto constexpr size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

Public Types

using type = std::uint64_t

Public Static Attributes

static char constexpr name[] = "uint64"
static auto constexpr size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

Public Types

using type = bool

Public Static Attributes

static char constexpr name[] = "bool"
static auto constexpr size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

Public Types

using type = std::uint8_t

Public Static Attributes

static char constexpr name[] = "uint8"
static auto constexpr size = sizeof(type)
template<>
struct TRTDataType<std::int8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT8
template<>
struct TRTDataType<std::int32_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT32
template<>
struct TRTDataType<std::uint32_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
template<>
struct TRTDataType<std::int64_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT64
template<>
struct TRTDataType<std::uint64_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
template<>
struct TRTDataType<std::uint8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kUINT8
template<>
struct TRTDataType<kernels::KVCacheIndex>

Public Static Attributes

static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
template<>
struct TRTDataType<kernels::FinishedState>

Public Static Attributes

static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value
template<>
struct TRTDataType<runtime::RequestType>

Public Static Attributes

static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value
namespace tensorrt_llm
namespace runtime

Typedefs

template<typename T>
using PointerElementType = typename std::remove_reference_t<T>::element_type

Enums

enum class MemoryType : std::int32_t

Values:

enumerator kGPU
enumerator kCPU
enumerator kPINNED
enumerator kUVM
enumerator kPINNEDPOOL

Functions

template<typename T>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::shared_ptr<T> const &ptr) noexcept
template<typename T, typename D>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::unique_ptr<T, D> &&ptr) noexcept
template<typename T>
T const *bufferCast(IBuffer const &buffer)

Gets a typed pointer to the constant underlying data of the buffer.

Template Parameters:

T – The type of the underlying data.

Parameters:

buffer – The buffer to get a pointer to.

Returns:

A pointer to constant T.

template<typename T>
T *bufferCast(IBuffer &buffer)

Gets a typed pointer to the underlying data of the buffer.

Template Parameters:

T – The type of the underlying data.

Parameters:

buffer – The buffer to get a pointer to.

Returns:

A pointer to T.

template<typename T>
T *bufferCastOrNull(IBuffer::SharedPtr const &bufferPtr)

Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.

Template Parameters:

T – The type of the underlying data.

Parameters:

bufferPtr – A possibly null shared ptr.

Returns:

A pointer to T, possibly nullptr.

template<typename T>
T const *bufferCastOrNull(IBuffer::SharedConstPtr const &bufferPtr)

Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.

Template Parameters:

T – The type of the underlying data.

Parameters:

bufferPtr – A possibly null shared ptr.

Returns:

A pointer to const T, possibly nullptr.

template<typename T>
T *bufferCastOrNull(std::optional<IBuffer::SharedPtr> const &optionalBufferPtr)

Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.

Template Parameters:

T – The type of the underlying data.

Parameters:

optionalBufferPtr – A possibly empty optional.

Returns:

A pointer to T, possibly nullptr.

template<typename T>
T const *bufferCastOrNull(std::optional<IBuffer::SharedConstPtr> const &optionalBufferPtr)

Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.

Template Parameters:

T – The type of the underlying data.

Parameters:

optionalBufferPtr – A possibly empty optional.

Returns:

A pointer to const T, possibly nullptr.
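A minimal sketch of typed access through bufferCast and bufferCastOrNull, using IBuffer::wrap (documented further down) to obtain a non-owning host buffer.

    #include <vector>
    using namespace tensorrt_llm::runtime;

    std::vector<float> host(8, 1.0f);
    IBuffer::SharedPtr buffer = IBuffer::wrap(host);   // non-owning view over host memory
    float* data = bufferCast<float>(*buffer);          // typed pointer to the underlying data
    float const* none = bufferCastOrNull<float>(IBuffer::SharedConstPtr{});   // null pointer in, nullptr out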

std::ostream &operator<<(std::ostream &output, IBuffer const &buffer)

Utility function to print a buffer.

class BufferDataType
#include <iBuffer.h>

A wrapper around nvinfer1::DataType that provides support for pointer types.

Public Functions

inline constexpr BufferDataType(nvinfer1::DataType dataType, bool _unsigned = false, bool pointer = false)
inline constexpr operator nvinfer1::DataType() const noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr bool isPointer() const noexcept
inline constexpr bool isUnsigned() const
inline constexpr std::size_t getSize() const noexcept

Public Static Attributes

static auto constexpr kTrtPointerType = nvinfer1::DataType::kINT64

Private Members

nvinfer1::DataType mDataType
bool mUnsigned
bool mPointer
template<typename T>
class BufferRange : public tensorrt_llm::common::ArrayView<T>

Public Types

using Base = tensorrt_llm::common::ArrayView<T>

Public Functions

inline BufferRange(T *data, size_type size)
template<typename U = T, std::enable_if_t<!std::is_const_v<U>, bool> = true>
inline explicit BufferRange(IBuffer &buffer)
template<typename U = T, std::enable_if_t<std::is_const_v<U>, bool> = true>
inline explicit BufferRange(IBuffer const &buffer)
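A small sketch showing BufferRange as a typed, iterable view over a buffer's elements; host memory is assumed so the loop runs on the CPU, and ArrayView is assumed to provide begin()/end().

    std::vector<std::int32_t> host(4);
    auto buffer = IBuffer::wrap(host);
    for (auto& value : BufferRange<std::int32_t>(*buffer))   // iterate the elements as std::int32_t
        value = 42;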
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits
#include <iBuffer.h>

For converting a TensorRT data type to a C++ data type.

template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>

Public Types

using type = typename DataTypeTraits<kDataType, kUnsigned, false>::type*

Public Static Attributes

static char constexpr name[] = "*"
static auto constexpr size = sizeof(type)
class IBuffer

Subclassed by tensorrt_llm::runtime::ITensor

Public Types

using UniquePtr = std::unique_ptr<IBuffer>
using SharedPtr = std::shared_ptr<IBuffer>
using UniqueConstPtr = std::unique_ptr<IBuffer const>
using SharedConstPtr = std::shared_ptr<IBuffer const>
using DataType = nvinfer1::DataType

Public Functions

virtual void *data() = 0

Returns a pointer to underlying array.

virtual void const *data() const = 0

Returns a pointer to underlying array.

inline virtual void *data(std::size_t index)

Returns a pointer to the underlying array at a given element index.

inline virtual void const *data(std::size_t index) const

Returns a pointer to the underlying array at a given element index.

virtual std::size_t getSize() const = 0

Returns the size (in number of elements) of the buffer.

inline virtual std::size_t getSizeInBytes() const

Returns the size (in bytes) of the buffer.

virtual std::size_t getCapacity() const = 0

Returns the capacity of the buffer.

virtual DataType getDataType() const = 0

Returns the data type of the buffer.

virtual char const *getDataTypeName() const
virtual MemoryType getMemoryType() const = 0

Returns the memory type of the buffer.

virtual char const *getMemoryTypeName() const
virtual void resize(std::size_t newSize) = 0

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

virtual void release() = 0

Releases the buffer. It will be reset to nullptr.

virtual ~IBuffer() = default
IBuffer(IBuffer const&) = delete

Not allowed to copy.

IBuffer &operator=(IBuffer const&) = delete

Not allowed to copy.

Public Static Functions

static UniquePtr slice(SharedPtr buffer, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.

Parameters:
  • buffer – The buffer to view.

  • offset – The offset of the view.

  • size – The size of the view.

Returns:

A view on the buffer.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
static inline UniquePtr slice(SharedPtr buffer, std::size_t offset)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
static inline UniquePtr view(SharedPtr tensor)

Returns a view on the underlying tensor which can be independently resized.

Parameters:

tensor – The tensor to view.

Returns:

A view on the tensor.

static inline UniquePtr view(SharedPtr tensor, std::size_t size)

Returns a view on the underlying tensor with a different size.

Parameters:
  • tensor – The tensor to view.

  • size – The size of the view.

Returns:

A view on the tensor.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)

Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.

Parameters:
  • data – The data to wrap.

  • type – The data type of the data.

  • size – The size of the buffer.

  • capacity – The capacity of the buffer.

Returns:

An IBuffer.

static inline UniquePtr wrap(void *data, DataType type, std::size_t size)
template<typename T>
static inline UniquePtr wrap(T *data, std::size_t size, std::size_t capacity)
template<typename T>
static inline UniquePtr wrap(T *data, std::size_t size)
template<typename T>
static inline UniquePtr wrap(std::vector<T> &v)
static MemoryType memoryType(void const *data)

Determine the memory type of a pointer.
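A short sketch combining wrap, slice, and memoryType; the vector provides ordinary host memory, so no allocations or copies occur.

    std::vector<std::int32_t> host(100);
    IBuffer::SharedPtr buffer = IBuffer::wrap(host);                     // size == capacity == 100, not owned
    auto firstHalf = IBuffer::slice(buffer, /*offset=*/0, /*size=*/50);  // view on elements [0, 50)
    auto tail      = IBuffer::slice(buffer, /*offset=*/50);              // view on the remaining elements
    auto memType   = IBuffer::memoryType(host.data());                   // expected to report MemoryType::kCPU (assumption)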

Protected Functions

IBuffer() = default
inline std::size_t toBytes(std::size_t size) const

Returns an array index or size in bytes.

template<MemoryType T>
struct MemoryTypeString
template<typename T, bool = false>
struct TRTDataType
#include <iBuffer.h>

For converting a C++ data type to a TensorRT data type.

template<>
struct TRTDataType<bool>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kBOOL
template<>
struct TRTDataType<float>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kFLOAT
template<>
struct TRTDataType<half>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kHALF
template<typename T>
struct TRTDataType<T*>

Public Static Attributes

static auto constexpr value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}

Private Static Attributes

static auto constexpr kUnderlyingType = BufferDataType{TRTDataType<std::remove_const_t<T>, false>::value}
template<>
struct TRTDataType<void*>

Public Static Attributes

static constexpr auto value = BufferDataType::kTrtPointerType
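These traits allow the C++-to-TensorRT type mapping to be checked at compile time; a brief sketch relying only on the specializations listed above.

    #include <type_traits>

    static_assert(TRTDataType<float>::value == nvinfer1::DataType::kFLOAT);
    static_assert(TRTDataType<std::int32_t>::value == nvinfer1::DataType::kINT32);
    static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kINT32>::type, std::int32_t>);
    // Pointer types map to the TensorRT pointer representation (kINT64) with the pointer flag set.
    static_assert(TRTDataType<float*>::value.isPointer());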

iGptDecoderBatched.h

namespace tensorrt_llm
namespace runtime
class IGptDecoderBatched : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
#include <iGptDecoderBatched.h>

GPT decoder class with support for in-flight batching.

Subclassed by tensorrt_llm::runtime::GptDecoderBatched

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = std::shared_ptr<ITensor>
using DecoderFinishedEventPtr = std::unique_ptr<decoder_batch::DecoderFinishedEvent const>

Public Functions

virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) = 0

Setup buffers for ExplicitDraftTokens decoding.

virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) = 0

Setup buffers for Eagle decoding.

virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) = 0

Setup buffers for Lookahead decoding.

virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &token, decoder_batch::Output &output, decoder_batch::Input const &input) = 0

Call decoder forwardSync and wait for the call to forwardAsync associated with a token to complete.

virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &token) = 0

Wait for the call to forwardAsync associated with a token to complete.

inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)

Run one step for all requests and wait for completion on the host.
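A hedged sketch of the asynchronous pattern these calls enable; decoder is assumed to be an IGptDecoderBatched implementation, and output/input prepared decoder_batch structures.

    // Enqueue one decoding step; returns an event marking its completion.
    auto finishedEvent = decoder.forwardAsync(output, input);
    // ... overlap host-side work here, e.g. preparing the next engine inputs ...
    decoder.forwardSync(*finishedEvent);   // block until the step associated with the event has finished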

virtual TensorPtr getIds(SizeType32 batchIdx) const = 0
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu

virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const = 0
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search in GptDecoderBatched. It contains gathered token ids without padding, on gpu

virtual CudaEvent finalize(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const = 0

Gather final beam search results for request batchIdx. The result will only be available after the returned event has completed.

virtual std::vector<bool> getFinished() const = 0
Returns:

[batchSize (actual)], marks finished requests (per batch)

virtual TensorPtr getFinishReasons() const = 0
Returns:

[batchSize, beamWidth], FinishedState value, on gpu

virtual TensorPtr getCumLogProbs() const = 0
Returns:

[batchSize, beamWidth], cumulative log probabilities (per beam), on gpu

virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const = 0
Returns:

[beamWidth], cumulative log probabilities (per beam) for request batchIdx, on gpu

virtual TensorPtr getLogProbs() const = 0
Returns:

[batchSize, beamWidth, maxSeqLen], log probabilities (per beam), on gpu

virtual TensorPtr getLogProbs(SizeType32 batchIdx) const = 0
Returns:

[beamWidth, maxSeqLen], log probabilities (per beam) for request batchIdx, on gpu

virtual TensorPtr getParentIds() const = 0
virtual std::vector<SizeType32> getNbSteps() const = 0
virtual executor::DecodingMode getDecodingMode() const = 0
virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs, ModelConfig const &modelConfig) = 0

Initialize the batched decoder at seqSlots with new requests.

virtual TensorPtr getNextDraftTokens() const = 0
Returns:

[batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu

virtual TensorPtr getPrevDraftTokensLengths() const = 0
Returns:

[batchSize], predicted draft tokens lengths for previous step, on gpu

virtual TensorPtr getNextDraftTokensLengths() const = 0
Returns:

[batchSize], predicted draft tokens lengths for next step, on gpu

virtual TensorPtr getAcceptedLengthsCumSum() const = 0
Returns:

[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu

virtual TensorPtr getAcceptedPackedPaths() const = 0
Returns:

[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu

Protected Functions

IGptDecoderBatched() = default
namespace decoder_batch

Typedefs

using Output = decoder::Output
class DecoderFinishedEvent

Public Functions

inline explicit DecoderFinishedEvent(CudaEvent &&event, std::vector<bool> const &active)

Public Members

CudaEvent event
std::vector<bool> active
class Input

Public Types

using TensorConstPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit Input(std::vector<TensorPtr> const &logits, std::vector<bool> const &active)
inline explicit Input(std::vector<TensorPtr> const &logits)

Public Members

std::vector<TensorPtr> logits
std::vector<bool> active
TensorPtr cacheIndirection
std::vector<std::vector<TensorPtr>> predictedDraftLogits
TensorPtr seqSlots
std::optional<ExplicitDraftTokensBuffers::EngineOutputs> explicitDraftTokensInputs
std::optional<ExplicitDraftTokensBuffers::EngineInputs> explicitDraftTokensLastInputs
std::optional<EagleBuffers::EngineOutputs> eagleInputs
std::optional<EagleBuffers::Inputs> eagleLastInputs

iStatefulGptDecoder.h

namespace tensorrt_llm
namespace batch_manager
namespace runtime
class IStatefulGptDecoder
#include <iStatefulGptDecoder.h>

GPT decoder class with support for in-flight batching.

Subclassed by tensorrt_llm::runtime::IGptDecoderBatched

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) = 0

Setup the decoder before calling forward(), also calls reshapeBuffers.

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) = 0

Initialize the decoder with new batch of inputs.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0

Run one step for all requests without blocking the host thread.

virtual void forwardSync() = 0

Wait for the last call to forwardAsync to complete.

inline virtual void forward(decoder::Output &output, decoder::Input const &input)

Run one step for all requests.

virtual void finalize(SamplingConfig const &samplingConfig) const = 0

Gather final beam search results for all requests.

virtual TensorPtr getIds() const = 0
Returns:

[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu

virtual TensorPtr getGatheredIds() const = 0
Returns:

[batchSize, beamWidth, maxSequenceLength] token ids after gatherTree

virtual TensorPtr getCumLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

virtual TensorPtr getLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

virtual TensorPtr getNewTokens(SizeType32 iter = 0) const = 0

Get tokens generated in one step of last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

virtual TensorPtr getAllNewTokens() const = 0

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

virtual TensorPtr getNbFinished() const = 0
Returns:

[1], number of finished sequences, in pinned host memory

virtual ~IStatefulGptDecoder() = default

Protected Functions

IStatefulGptDecoder() = default
namespace decoder
class Input

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit Input(TensorPtr logits)

Public Members

TensorPtr logits
TensorPtr cacheIndirection
class Output

Public Types

using TensorPtr = std::shared_ptr<ITensor>

Public Functions

Output() = default

Public Members

TensorPtr cacheIndirection
TensorPtr sequenceLengths

iTensor.h

namespace nvinfer1
namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)

Utility function to print a shape.

std::ostream &operator<<(std::ostream &output, ITensor const &tensor)

Utility function to print a tensor with its shape.

template<typename T>
T const *bufferCastOrNull(ITensor::SharedConstPtr const &tensorPtr)

Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

Template Parameters:

T – The type of the underlying data.

Parameters:

tensorPtr – A possibly null shared ptr.

Returns:

A pointer to T const, possibly nullptr.

template<typename T>
T *bufferCastOrNull(ITensor::SharedPtr const &tensorPtr)

Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

Template Parameters:

T – The type of the underlying data.

Parameters:

tensorPtr – A possibly null shared ptr.

Returns:

A pointer to T, possibly nullptr.

template<typename T>
T *bufferCastOrNull(std::optional<ITensor::SharedPtr> const &optionalTensorPtr)

Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalTensorPtr, or nullptr if the optional doesn’t have a value.

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

Template Parameters:

T – The type of the underlying data.

Parameters:

optionalTensorPtr – A possibly empty optional.

Returns:

A pointer to T, possibly nullptr.

template<typename T>
T const *bufferCastOrNull(std::optional<ITensor::SharedConstPtr> const &optionalTensorPtr)

Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalTensorPtr, or nullptr if the optional doesn’t have a value.

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

Template Parameters:

T – The type of the underlying data.

Parameters:

optionalTensorPtr – A possibly empty optional.

Returns:

A pointer to const T, possibly nullptr.

class ITensor : public virtual tensorrt_llm::runtime::IBuffer

Public Types

using UniquePtr = std::unique_ptr<ITensor>
using SharedPtr = std::shared_ptr<ITensor>
using UniqueConstPtr = std::unique_ptr<ITensor const>
using SharedConstPtr = std::shared_ptr<ITensor const>
using Shape = nvinfer1::Dims
using DimType64 = std::remove_reference_t<decltype(Shape::d[0])>
using TensorMap = runtime::StringPtrMap<runtime::ITensor>

Public Functions

~ITensor() override = default
virtual Shape const &getShape() const = 0

Returns the tensor dimensions.

template<SizeType32 n>
inline DimType64 getDimension() const

Returns the tensor's n-th dimension. If n is negative, returns the (nbDims + n)-th dimension. TODO: replace with constexpr parameter when moving to C++20.

virtual void reshape(Shape const &dims) = 0

Sets the tensor dimensions. The new size of the tensor will be volume(dims)

inline virtual void resize(std::size_t newSize) override

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

ITensor(ITensor const&) = delete

Not allowed to copy.

ITensor &operator=(ITensor const&) = delete

Not allowed to copy.

inline void squeeze(SizeType32 dim)

Removes the given unit dimensions from this tensor.

inline void unsqueeze(SizeType32 dim)

Adds a unit dimension at the specified position.

inline bool shapeEquals(Shape const &other) const
inline bool shapeEquals(std::initializer_list<SizeType32> const &other) const
template<typename T>
inline bool shapeEquals(T const *dims, SizeType32 count) const

Public Static Functions

static inline std::int64_t volume(Shape const &dims)

Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.

static inline std::size_t volumeNonNegative(Shape const &shape)

Returns the volume of the dimensions. Throws if d.nbDims < 0.

static inline Shape strides(Shape const &dims)

Returns the strides of each dimension in a Shape.

static Shape squeeze(Shape const &shape, SizeType32 dim)

Removes the given unit dimension from shape.

Parameters:
  • shape – The shape to squeeze.

  • dim – The dimension that should be removed (“squeezed”).

Returns:

A new shape without the unit dimension.

static Shape unsqueeze(Shape const &shape, SizeType32 dim)

Add a unit dimension to shape at the specified position.

Parameters:
  • shape – The shape to unsqueeze.

  • dim – The dimension where unit dimension should be added.

Returns:

A new shape with the added unit dimension.

static UniquePtr slice(SharedPtr tensor, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.

Parameters:
  • tensor – The tensor to view.

  • offset – The offset of the view w.r.t. dimension 0 of the tensor.

  • size – The size of the view w.r.t. dimension 0 of the tensor.

Returns:

A view on the buffer.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
static inline UniquePtr slice(SharedPtr tensor, std::size_t offset)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
static UniquePtr slice(SharedPtr tensor, Shape const &offsetDims, DimType64 size)
Parameters:
  • tensor – The tensor to view.

  • offsetDims – The offset dimensions of the view.

  • size – The size of the view w.r.t. the last dimension in offsetDims.

Throws:

Whenever offset overflows or the last dimension offset+size overflows.

Returns:

A view of shape [size, the rest dimensions], or [size] when offsetDims specifies all dimensions.

static inline UniquePtr slice(SharedPtr tensor, std::initializer_list<DimType64> const &offsetDims, DimType64 size)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims, std::size_t size)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims, std::size_t size)
static inline UniquePtr slice(SharedPtr tensor, Shape const &offsetDims)

Returns the remaining slices at the last dimension when size is omitted.

static inline UniquePtr slice(SharedPtr tensor, std::initializer_list<DimType64> const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
static inline UniquePtr at(SharedPtr tensor, Shape const &offsetDims)
Parameters:

offsetDims – The offset dimensions of the block.

Returns:

Just the block at that point, with shape of [the rest dimensions], or [1] when offsetDims specifies all dimensions.

static inline UniquePtr at(SharedPtr tensor, std::initializer_list<DimType64> const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr at(TConstPtr &&tensor, Shape const &offsetDims)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline ITensor::UniqueConstPtr at(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
static UniquePtr view(IBuffer::SharedPtr buffer, Shape const &dims)

Returns a view on the underlying buffer (or tensor) with the given shape.

Parameters:
  • buffer – The buffer to view.

  • dims – The shape of the view.

Returns:

A view on the tensor.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
static inline UniquePtr view(SharedPtr tensor)

Returns a view on the underlying tensor which can be independently reshaped.

Parameters:

tensor – The tensor to view.

Returns:

A view on the tensor.

static inline UniquePtr flattenN(SharedPtr tensor, std::int64_t sliceN = -1)

Returns a flattened view on the underlying tensor which can be independently reshaped.

Parameters:
  • tensor – The tensor to flatten.

  • sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.

Returns:

A flatten view on the tensor.

static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)

Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.

Parameters:
  • data – The data to wrap.

  • type – The data type of the data.

  • shape – The shape of the tensor.

  • capacity – The capacity of the buffer.

Returns:

An ITensor.

static inline UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(std::vector<T> &v, Shape const &shape)
static Shape makeShape(std::initializer_list<DimType64> const &dims)

A convenience function to create a tensor shape with the given dimensions.

static std::string toString(Shape const &dims)

A convenience function for converting a tensor shape to a string.

static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)

A convenience function to compare shapes.

template<typename T>
static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType32 count)

A convenience function to compare shapes.
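A compact sketch tying the shape helpers together with wrap, view, and slice; host memory is assumed and all values are illustrative.

    #include <cassert>
    #include <vector>

    auto shape = ITensor::makeShape({2, 3, 4});
    assert(ITensor::volume(shape) == 24);

    std::vector<float> host(24);
    ITensor::SharedPtr tensor = ITensor::wrap(host, shape);          // non-owning tensor over host data
    auto flat  = ITensor::view(tensor, ITensor::makeShape({6, 4}));  // same memory, shape [6, 4]
    auto block = ITensor::slice(tensor, /*offset=*/1);               // shape [1, 3, 4], offset along dimension 0
    auto text  = ITensor::toString(tensor->getShape());              // string form of the shape, e.g. for logging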

Protected Functions

ITensor() = default

Protected Static Functions

static inline DimType64 castSize(size_t newSize)

Friends

friend class ITensorBindings

ipcUtils.h

namespace tensorrt_llm
namespace runtime

Functions

void lamportInitializeAll(void *buffer_0, void *buffer_1, void *buffer_2, size_t size)
class AllReduceBuffers

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, SizeType32 hiddenSize, BufferManager const &manager, WorldConfig const &worldConfig, bool const fakeBuffers = false)

Public Members

TensorPtr mAllReduceCommPtrs
std::vector<runtime::IpcMemory> mIpcMemoryHandles
class IpcMemory

Public Types

using BufferPtr = IBuffer::SharedPtr

Public Functions

IpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig, bool openIpc = true)
~IpcMemory()
IpcMemory(IpcMemory const&) = delete
IpcMemory &operator=(IpcMemory const&) = delete
IpcMemory(IpcMemory&&) = default
IpcMemory &operator=(IpcMemory&&) = default
inline std::vector<void*> const &getCommPtrs() const

Public Static Attributes

static size_t constexpr FLAGS_SIZE = (tensorrt_llm::kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)

Private Functions

void allocateIpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig)
void destroyIpcMemory()

Private Members

SizeType32 mTpRank
std::vector<void*> mCommPtrs
BufferPtr mBuffer
bool mOpenIpc

lookaheadBuffers.h

namespace tensorrt_llm
namespace runtime
class LookaheadDecodingBuffers

Public Types

using SizeType32 = runtime::SizeType32
using TensorPtr = runtime::ITensor::SharedPtr
using ITensor = tensorrt_llm::runtime::ITensor

Public Functions

LookaheadDecodingBuffers(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::BufferManager const &bufferManager)

Public Members

TensorPtr generationLengths
TensorPtr positionOffsets
TensorPtr packedMasks
TensorPtr positionIds
class LookaheadRuntimeBuffers

Public Types

using SizeType32 = tensorrt_llm::runtime::SizeType32
using ITensor = tensorrt_llm::runtime::ITensor
using TensorPtr = runtime::ITensor::SharedPtr
using TensorMap = runtime::StringPtrMap<runtime::ITensor>

Public Functions

LookaheadRuntimeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const &requestTypes, ITensor const &seqSlots, LookaheadDecodingBuffers const &decoderLookaheadBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 tokensPerStep)
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const

Public Members

TensorPtr cumSumLength
TensorPtr packedMasksDevice
TensorPtr generationLengthsDevice
TensorPtr positionOffsetsDevice
TensorPtr positionIdsDevice
TensorPtr packedMaskHost
TensorPtr generationLengthsHost
TensorPtr positionOffsetsHost
TensorPtr positionIdsHost
TensorPtr packedMaskHostCopy
TensorPtr generationLengthsHostCopy
TensorPtr positionOffsetsHostCopy
TensorPtr positionIdsHostCopy
TensorPtr batchSlotsHostCopy

lookaheadModule.h

namespace tensorrt_llm
namespace runtime
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule

Public Functions

inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
inline explicit LookaheadModule() noexcept
inline void setExecutionConfig(executor::LookaheadDecodingConfig const &config)
inline executor::LookaheadDecodingConfig const getExecutionConfig() const

Private Members

executor::LookaheadDecodingConfig mExecutionConfig

loraCache.h

namespace tensorrt_llm
namespace runtime

Functions

std::string to_string(LoraCache::TaskLayerModuleConfig const &v)
std::ostream &operator<<(std::ostream &os, LoraCache::TaskLayerModuleConfig const &v)
class LoraCache
#include <loraCache.h>

LoraCache

Caches LoRA weights with LRU eviction policy.

Tasks put in the cache are marked in progress and can not be evicted, until they are marked done.

A cache page holds an optimally sized LoRA. A page is of size [numSlots x pageWidth]. An optimally sized LoRA is one that has the configured optimalAdapterSize.

Conceptually, a slot corresponds to an r=1, 1-layer, 1-module set of in/out weights. Page width is set to the number of weights in the smallest module.

The number of slots per page is then ceilDiv(num weights in optimally sized LoRA, num weights in smallest module).

Cache pages are allocated on one or more blocks.

Public Types

using TensorPtr = ITensor::SharedPtr
using TaskIdType = std::uint64_t
using TaskLayerModuleConfigListPtr = std::shared_ptr<std::vector<TaskLayerModuleConfig>>

Public Functions

LoraCache(LoraCachePageManagerConfig const &pageManagerConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, BufferManager const &bufferManager)

Parameters:
  • pageManagerConfig[in] a LoraCachePageManagerConfig

  • modelConfig[in] a ModelConfig

  • worldConfig[in] a WorldConfig

  • bufferManager[in] a BufferManager, only used to allocate page blocks

void put(TaskIdType taskId, TensorPtr weights, TensorPtr config, bool load = true)

put a task in the cache, claim pages for it, and optionally load task weights.

Parameters:
  • taskId[in] the task id

  • weights[in] lora weights tensor

  • config[in] lora config tensor

  • load[in] if true load weights before returning, otherwise do not

void loadWeights(TaskIdType taskId, TensorPtr weights, TensorPtr config)

load task weights. This method must be called after put. It is designed to be called asynchronously after put returns with load = false; see the sketch below.

Parameters:
  • taskId[in] the task id

  • weights[in] lora weights tensor

  • config[in] lora config tensor
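A hedged sketch of the deferred-load pattern described for put and loadWeights; loraCache, taskId, weights, and config are assumed to be prepared elsewhere.

    loraCache.put(taskId, weights, config, /*load=*/false);   // claim pages, defer the weight copy
    // ... later, possibly on a worker thread:
    loraCache.loadWeights(taskId, weights, config);           // fill the claimed pages
    if (loraCache.isLoaded(taskId))
    {
        auto const& values = loraCache.get(taskId);           // per layer/module pointers into the cache pages
    }
    loraCache.markTaskDone(taskId);                           // the task may now be evicted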

inline bool isLoaded(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

true if task is loaded (weights are in place) and false otherwise

bool isDone(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

true if task is marked done and can be evicted

inline bool has(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

true if task is in the cache (not necessarily loaded) and false otherwise

std::vector<TaskLayerModuleConfig> const &get(TaskIdType taskId)
Parameters:

taskId[in] the task id

Returns:

list of Value objects with pointers to task weights

void bump(TaskIdType taskId)

bump task and make it the most recently used

Parameters:

taskId[in] the task id

void markTaskDone(TaskIdType taskId)

mark task done meaning it can be evicted

Parameters:

taskId[in] the task id

void markAllDone()

mark all tasks in cache done

SizeType32 determineNumPages(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

number of pages needed to store the given task

SizeType32 determineNumPages(TensorPtr config) const
Parameters:

config[in] lora config tensor

Returns:

number of pages needed to store the task configured with config tensor

bool fits(TensorPtr config) const
Parameters:

config[in] a lora config tensor

Returns:

true if the task fits in the cache, false otherwise

void copyTask(TaskIdType taskId, LoraCache &deviceCache, bool markDone = false)

copy task to another cache. Caches must have the same page size.

Parameters:
  • taskId[in] the task id to copy

  • deviceCache[in] the LoraCache to copy the task to

  • markDone[in] mark the copied task done as it’s copied

SizeType32 getNumPages() const
Returns:

total number of pages allocated to cache (used or not)

ITensor::SharedConstPtr getPagePtr(size_t pageId) const
Parameters:

pageId[in] the page id

Returns:

const pointer to page

Public Static Functions

static std::vector<LoraCache::TaskLayerModuleConfig> copyToPages(TensorPtr weights, TensorPtr config, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::unordered_map<SizeType32, LoraModule> moduleIdToModel, BufferManager const &manager, std::vector<TensorPtr> const &pages, std::vector<std::size_t> const &pageIds)

Copy task weights to cache pages.

Parameters:
  • weights[in] task weights

  • config[in] task config tensor

  • modelConfig[in] a ModelConfig

  • worldConfig[in] a WorldConfig

  • moduleIdToModel[in] map from lora module id to LoraModule

  • manager[in] a BufferManager the manager to use to perform the copies

  • pages[out] list of page tensors to copy weights to

  • pageIds[in] page ids for the pages

Returns:

list of cache Value objects

static void splitTransposeCpu(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)

Splits the second dimension of input into tpSize parts and writes the tpRank-th split to output.

Parameters:
  • output[out] output tensor

  • input[in] input tensor

  • tpSize[in] number of splits

  • tpRank[in] the split to write to output

Private Types

enum ValueStatus

Values:

enumerator kVALUE_STATUS_MISSING
enumerator kVALUE_STATUS_PROCESSING
enumerator kVALUE_STATUS_LOADED
using TaskValuePtr = std::shared_ptr<TaskValue>

Private Functions

void loadWeights(TaskValue &cacheValue, TensorPtr weights, TensorPtr config)
void bumpTaskInProgress(TaskIdType taskId)
ValueStatus getStatus(TaskIdType taskId) const
std::vector<std::size_t> claimPagesWithEvict(SizeType32 numPages)

claim numPages, evicting tasks if needed

Parameters:

numPages[in] number of pages to claim

Throws:

std::runtime_error – if all pages cannot be claimed

Returns:

list of page ids

std::map<size_t, std::pair<size_t, SizeType32>> copyTaskMapPages(TaskValue &targetTaskValue, TaskValue const &sourceTaskValue, std::vector<size_t> const &targetPageIds, LoraCache const &targetCache)

Internal helper method used inside copyTask. Not thread safe on its own.

Private Members

LoraCachePageManagerConfig mPageManagerConfig
ModelConfig mModelConfig
WorldConfig mWorldConfig
mutable std::mutex mPagesMutex
std::unique_ptr<LoraCachePageManager> mCachePageManager
mutable std::mutex mCacheMutex
std::unordered_map<TaskIdType, TaskValuePtr> mCacheMap
std::list<TaskIdType> mInProgressTasks
std::list<TaskIdType> mDoneTasks
std::vector<std::unique_ptr<BufferManager>> mDeviceBufferManagers
std::unique_ptr<BufferManager> mBufferManager
std::unordered_map<SizeType32, LoraModule> mModuleIdToModule

Private Static Functions

template<typename T>
static void splitTransposeCpuInner(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)
struct TaskLayerModuleConfig
#include <loraCache.h>

Contains information on a single layer / module. A list of these configs is associated with each task and can be used to populate runtime tensors.

Public Functions

std::string toString() const
bool operator==(LoraCache::TaskLayerModuleConfig const &o) const

Public Members

std::size_t pageId
SizeType32 slotIdx
SizeType32 inSize
SizeType32 outSize
SizeType32 moduleId
SizeType32 layerId
SizeType32 adapterSize
SizeType32 numSlots
std::int64_t weightsInPointer
std::int64_t weightsOutPointer

Friends

friend class TaskLayerModuleConfigBindings
struct TaskValue

Holds configuration and state for a single task.

Public Functions

TaskValue() = delete
~TaskValue() = default
inline TaskValue(std::vector<std::size_t> const &pageIds, TaskLayerModuleConfigListPtr const &configs, std::list<TaskIdType>::iterator it, bool inProgress, bool loaded, bool done, bool loadInProgress = false)
inline TaskValue(TaskValue &&o) noexcept
inline TaskValue &operator=(TaskValue &&o)

Public Members

std::vector<std::size_t> pageIds
TaskLayerModuleConfigListPtr configs
std::list<TaskIdType>::iterator it
bool inProgress
bool loaded
bool done

Marks a task as done. This is used to mark a task as done during loading. If done=true at the end of loading (end of put, loadWeights, or copyTask), the task will be marked as done.

bool loadInProgress

Indicates weights are loading, either in put or loadWeights. This is used to block concurrent loadWeights calls for the same task.

class LoraCacheFullException : public tensorrt_llm::runtime::LoraExpectedException

Public Functions

explicit LoraCacheFullException(std::string const &msg)
~LoraCacheFullException() noexcept override
class LoraCachePageManager
#include <loraCache.h>

Holds memory of lora cache pages, and manages allocation and freeing of whole pages. Memory is pre-allocated either on the host or device.

Note that this class is not thread safe.

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)
Parameters:
  • config[in] a LoraCachePageManagerConfig

  • bufferManager[in] a BufferManager used to allocate the page blocks
std::optional<std::vector<std::size_t>> claimPages(SizeType32 numPages)

claim pages

Parameters:

numPages[in] number of pages to claim

Returns:

an optional list of page ids: if the pages could be claimed, the optional contains the claimed pageIds; otherwise it is empty (std::nullopt).

SizeType32 numAvailablePages() const

get number of available (free) pages in manager

Returns:

number of free pages in manager

void releasePages(std::vector<std::size_t> const &pages)

release given pages

Parameters:

pages[in] list of pages to release (free)

ITensor::SharedConstPtr blockPtr(SizeType32 blockIdx) const

return pointer to given page block

Parameters:

blockIdx[in]

Returns:

pointer to page block

ITensor::SharedConstPtr pagePtr(std::size_t pageIdx) const

return pointer to given page

Parameters:

pageIdx[in]

Returns:

const pointer to page

ITensor::SharedPtr mutablePagePtr(std::size_t pageIdx)

return pointer to given page

Parameters:

pageIdx[in]

Returns:

mutable pointer to page

Private Functions

void initialize(BufferManager const &bufferManager)

Private Members

std::vector<TensorPtr> mPageBlocks
std::deque<std::size_t> mFreePageIds
std::vector<std::uint8_t> mIsPageFree
LoraCachePageManagerConfig const mConfig
class LoraExpectedException : public std::runtime_error

Subclassed by tensorrt_llm::runtime::LoraCacheFullException

Public Functions

explicit LoraExpectedException(std::string const &msg)
~LoraExpectedException() noexcept override

loraCachePageManagerConfig.h

namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &os, LoraCachePageManagerConfig const &c)
inline std::string to_string(LoraCachePageManagerConfig const &c)
class LoraCachePageManagerConfig
#include <loraCachePageManagerConfig.h>

Configuration for LoraCachePageManager

See LoraCache docs for description of pages, slots, and page blocks.

Public Functions

inline explicit constexpr LoraCachePageManagerConfig(runtime::MemoryType memType, nvinfer1::DataType dType, SizeType32 totalNumPages, SizeType32 maxPagesPerBlock, SizeType32 slotsPerPage, SizeType32 pageWidth, SizeType32 numCopyStreams)
inline runtime::MemoryType constexpr getMemoryType() const noexcept
inline void constexpr setMemoryType(runtime::MemoryType const &memoryType) noexcept
inline nvinfer1::DataType constexpr getDataType() const noexcept
inline void constexpr setDataType(nvinfer1::DataType const &dtype) noexcept
inline SizeType32 constexpr getTotalNumPages() const noexcept
inline void constexpr setTotalNumPage(SizeType32 const &totalNumPages) noexcept
inline SizeType32 constexpr getMaxPagesPerBlock() const noexcept
inline void constexpr setMaxPagesPerBlock(SizeType32 const &maxPagesPerBlock) noexcept
inline SizeType32 constexpr getSlotsPerPage() const noexcept
inline void constexpr setSlotsPerPage(SizeType32 const &slotsPerPage) noexcept
inline SizeType32 constexpr getPageWidth() const noexcept
inline void constexpr setPageWidth(SizeType32 const &pageWidth) noexcept
inline bool constexpr getInitToZero() const noexcept
inline void constexpr setInitToZero(bool initToZero) noexcept
inline SizeType32 constexpr getNumCopyStreams() const noexcept
inline void constexpr setNumCopyStreams(SizeType32 numCopyStreams) noexcept
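A small sketch constructing a configuration whose pages live on the GPU; all sizes are illustrative.

    using tensorrt_llm::runtime::LoraCachePageManagerConfig;
    using tensorrt_llm::runtime::MemoryType;

    LoraCachePageManagerConfig pageConfig(MemoryType::kGPU, nvinfer1::DataType::kHALF,
        /*totalNumPages=*/64, /*maxPagesPerBlock=*/8, /*slotsPerPage=*/16,
        /*pageWidth=*/256, /*numCopyStreams=*/1);
    pageConfig.setInitToZero(true);   // zero-initialize pages when blocks are allocated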

Private Members

runtime::MemoryType mMemoryType
nvinfer1::DataType mDataType
SizeType32 mTotalNumPages
SizeType32 mMaxPagesPerBlock
SizeType32 mSlotsPerPage
SizeType32 mPageWidth
SizeType32 mNumCopyStreams = 1
bool mInitToZero

loraModule.h

namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &output, LoraModule const &module)
class LoraModule

Public Types

enum class ModuleType : SizeType32

Values:

enumerator kINVALID
enumerator kATTN_QKV
enumerator kATTN_Q
enumerator kATTN_K
enumerator kATTN_V
enumerator kATTN_DENSE
enumerator kMLP_H_TO_4H
enumerator kMLP_4H_TO_H
enumerator kMLP_GATE
enumerator kCROSS_ATTN_QKV
enumerator kCROSS_ATTN_Q
enumerator kCROSS_ATTN_K
enumerator kCROSS_ATTN_V
enumerator kCROSS_ATTN_DENSE
enumerator kMOE_H_TO_4H
enumerator kMOE_4H_TO_H
enumerator kMOE_GATE
enumerator kMOE_ROUTER
enumerator kMLP_ROUTER
using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit constexpr LoraModule(ModuleType const &t, SizeType32 inDim, SizeType32 outDim, bool inDimFirst, bool outDimFirst, SizeType32 inTpSplitDim, SizeType32 outTpSplitDim) noexcept
inline explicit constexpr LoraModule() noexcept
explicit constexpr LoraModule(LoraModule const &o) = default
constexpr LoraModule &operator=(LoraModule const &o) = default
inline SizeType32 constexpr flattenedInOutSize(SizeType32 adapterSize) const noexcept
inline SizeType32 constexpr inSize(SizeType32 adapterSize) const noexcept
inline SizeType32 constexpr outSize(SizeType32 adapterSize) const noexcept
inline SizeType32 constexpr localInSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline SizeType32 constexpr localOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline SizeType32 constexpr localInDim(SizeType32 tpSize) const noexcept
inline SizeType32 constexpr localOutDim(SizeType32 tpSize) const noexcept
inline SizeType32 constexpr localInAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline SizeType32 constexpr localOutAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline SizeType32 constexpr localInOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
inline SizeType32 constexpr value() const noexcept
inline std::string_view constexpr name() const noexcept
inline SizeType32 constexpr inDim() const noexcept
inline SizeType32 constexpr outDim() const noexcept
inline bool constexpr inDimFirst() const noexcept
inline bool constexpr outDimFirst() const noexcept
inline SizeType32 constexpr inTpSplitDim() const noexcept
inline SizeType32 constexpr outTpSplitDim() const noexcept

Public Static Functions

static std::vector<LoraModule> createLoraModules(std::vector<std::string> const &loraModuleNames, SizeType32 hiddenSize, SizeType32 mlpHiddenSize, SizeType32 numAttentionHeads, SizeType32 numKvAttentionHeads, SizeType32 attentionHeadSize, SizeType32 tpSize, SizeType32 numExperts)
static inline ModuleType constexpr toModuleType(std::string_view const &name)
static inline std::string_view constexpr toModuleName(ModuleType t) noexcept
static inline std::string_view constexpr toModuleName(SizeType32 id)

Private Members

ModuleType mType
SizeType32 mInDim
SizeType32 mOutDim
bool mInDimFirst
bool mOutDimFirst
SizeType32 mInTpSplitDim
SizeType32 mOutTpSplitDim

medusaModule.h

namespace tensorrt_llm
namespace runtime
class MedusaModule : public tensorrt_llm::runtime::SpeculativeDecodingModule

Public Types

using TensorPtr = ITensor::SharedPtr
using MedusaChoices = std::vector<std::vector<SizeType32>>

Public Functions

inline explicit MedusaModule(SizeType32 maxAcceptedTokens, SizeType32 maxDraftTokens) noexcept
inline explicit MedusaModule() noexcept
inline MedusaChoices const &getMedusaChoices() const noexcept

Private Members

MedusaChoices mDefaultMedusaChoices = {{0}, {0, 0}, {1}, {0, 1}, {2}, {0, 0, 0}, {1, 0}, {0, 2}, {3}, {0, 3}, {4}, {0, 4}, {2, 0}, {0, 5}, {0, 0, 1}, {5}, {0, 6}, {6}, {0, 7}, {0, 1, 0}, {1, 1}, {7}, {0, 8}, {0, 0, 2}, {3, 0}, {0, 9}, {8}, {9}, {1, 0, 0}, {0, 2, 0}, {1, 2}, {0, 0, 3}, {4, 0}, {2, 1}, {0, 0, 4}, {0, 0, 5}, {0, 0, 0, 0}, {0, 1, 1}, {0, 0, 6}, {0, 3, 0}, {5, 0}, {1, 3}, {0, 0, 7}, {0, 0, 8}, {0, 0, 9}, {6, 0}, {0, 4, 0}, {1, 4}, {7, 0}, {0, 1, 2}, {2, 0, 0}, {3, 1}, {2, 2}, {8, 0}, {0, 5, 0}, {1, 5}, {1, 0, 1}, {0, 2, 1}, {9, 0}, {0, 6, 0}, {0, 0, 0, 1}, {1, 6}, {0, 7, 0}}

memoryCounters.h

namespace tensorrt_llm
namespace runtime
class MemoryCounters

Public Types

using SizeType32 = std::size_t
using DiffType = std::ptrdiff_t

Public Functions

MemoryCounters() = default
inline SizeType32 getGpu() const
inline SizeType32 getCpu() const
inline SizeType32 getPinned() const
inline SizeType32 getUVM() const
inline SizeType32 getPinnedPool() const
inline DiffType getGpuDiff() const
inline DiffType getCpuDiff() const
inline DiffType getPinnedDiff() const
inline DiffType getUVMDiff() const
inline DiffType getPinnedPoolDiff() const
template<MemoryType T>
inline void allocate(SizeType32 size)
void allocate(MemoryType memoryType, SizeType32 size)
template<MemoryType T>
inline void deallocate(SizeType32 size)
void deallocate(MemoryType memoryType, SizeType32 size)
std::string toString() const

Public Static Functions

static MemoryCounters &getInstance()
static std::string bytesToString(SizeType32 bytes, int precision = 2)
static std::string bytesToString(DiffType bytes, int precision = 2)
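A brief sketch reading the global counters via getInstance.

    #include <iostream>
    using tensorrt_llm::runtime::MemoryCounters;

    auto& counters = MemoryCounters::getInstance();
    std::cout << "GPU bytes in use: " << MemoryCounters::bytesToString(counters.getGpu()) << '\n';
    std::cout << counters.toString() << std::endl;   // summary across all memory types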

Private Members

std::atomic<SizeType32> mGpu = {}
std::atomic<SizeType32> mCpu = {}
std::atomic<SizeType32> mPinned = {}
std::atomic<SizeType32> mUVM = {}
std::atomic<SizeType32> mPinnedPool = {}
std::atomic<DiffType> mGpuDiff = {}
std::atomic<DiffType> mCpuDiff = {}
std::atomic<DiffType> mPinnedDiff = {}
std::atomic<DiffType> mUVMDiff = {}
std::atomic<DiffType> mPinnedPoolDiff = {}

modelConfig.h

namespace tensorrt_llm
namespace runtime
class ModelConfig

Public Types

enum class ModelVariant : std::int32_t

Values:

enumerator kGpt
enumerator kChatGlm
enumerator kGlm
enumerator kMamba
enumerator kRecurrentGemma
enumerator kEncDec
enum class LayerType : std::int32_t

Values:

enumerator kATTENTION
enumerator kRECURRENT
enumerator kLINEAR
enumerator kNOOP
enum class KVCacheType : std::int32_t

Values:

enumerator kCONTINUOUS
enumerator kPAGED
enumerator kDISABLED
enum class ManageWeightsType : std::int32_t

Values:

enumerator kDisabled
enumerator kEnabled

Public Functions

inline explicit ModelConfig(SizeType32 vocabSize, SizeType32 nbLayers, SizeType32 nbAttentionLayers, SizeType32 nbRnnLayers, SizeType32 nbHeads, SizeType32 hiddenSize, nvinfer1::DataType dtype)
inline SizeType32 constexpr getVocabSize() const noexcept
inline SizeType32 constexpr getVocabSizePadded(SizeType32 worldSize) const noexcept
inline SizeType32 countLocalLayers(LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
inline SizeType32 countLowerRankLayers(LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
inline SizeType32 getNbLayers(SizeType32 pipelineParallelism = 1) const
inline SizeType32 getNbAttentionLayers(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
inline SizeType32 getNbRnnLayers(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
inline SizeType32 constexpr getNbHeads() const noexcept
inline SizeType32 getNbKvHeads(SizeType32 layerIdx) const
inline void setNbKvHeads(SizeType32 nbKvHeads)
inline void setNbCrossKvHeads(SizeType32 nbKvHeads)
inline SizeType32 constexpr getHiddenSize() const noexcept
inline SizeType32 constexpr getEncoderHiddenSize() const noexcept
inline void constexpr setEncoderHiddenSize(SizeType32 encoderHiddenSize) noexcept
inline SizeType32 constexpr getSizePerHead() const noexcept
inline void constexpr setSizePerHead(SizeType32 sizePerHead) noexcept
inline nvinfer1::DataType constexpr getDataType() const noexcept
inline bool constexpr useGptAttentionPlugin() const noexcept
inline void constexpr useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
inline bool constexpr useMambaConv1dPlugin() const noexcept
inline void constexpr useMambaConv1dPlugin(bool useMambaConv1dPlugin) noexcept
inline bool constexpr usePackedInput() const noexcept
inline void constexpr usePackedInput(bool inputPacked) noexcept
inline bool constexpr usePagedState() const noexcept
inline void constexpr usePagedState(bool pagedState) noexcept
inline SizeType32 constexpr getTokensPerBlock() const noexcept
inline void constexpr setTokensPerBlock(SizeType32 TokensPerBlock) noexcept
inline common::QuantMode constexpr getQuantMode() const noexcept
inline void constexpr setQuantMode(common::QuantMode QuantMode) noexcept
inline bool constexpr supportsInflightBatching() const noexcept
inline SizeType32 constexpr getMaxBatchSize() const noexcept
inline void constexpr setMaxBatchSize(SizeType32 maxBatchSize) noexcept
inline SizeType32 constexpr getMaxBeamWidth() const noexcept
inline void constexpr setMaxBeamWidth(SizeType32 maxBeamWidth) noexcept
inline SizeType32 constexpr getMaxInputLen() const noexcept
inline void constexpr setMaxInputLen(SizeType32 maxInputLen) noexcept
inline SizeType32 constexpr getMaxSequenceLen() const noexcept
inline void constexpr setMaxSequenceLen(SizeType32 maxSequenceLen) noexcept
inline std::optional<SizeType32> constexpr getMaxNumTokens() const noexcept
inline void constexpr setMaxNumTokens(std::optional<SizeType32> maxNumTokens) noexcept
inline SizeType32 constexpr getMaxEncoderLen() const noexcept
inline void constexpr setMaxEncoderLen(SizeType32 maxEncoderLen) noexcept
inline bool constexpr usePromptTuning() const noexcept
inline bool constexpr useMrope() const noexcept
inline void constexpr setUseMrope(bool useMrope) noexcept
inline SizeType32 constexpr getMaxPositionEmbeddings() const noexcept
inline void constexpr setMaxPositionEmbeddings(SizeType32 maxPositionEmbeddings) noexcept
inline SizeType32 constexpr getRotaryEmbeddingDim() const noexcept
inline void constexpr setRotaryEmbeddingDim(SizeType32 rotaryEmbeddingDim) noexcept
inline SizeType32 constexpr getMaxPromptEmbeddingTableSize() const noexcept
inline void constexpr setMaxPromptEmbeddingTableSize(SizeType32 maxPromptEmbeddingTableSize) noexcept
inline bool constexpr computeContextLogits() const noexcept
inline void constexpr computeContextLogits(bool computeContextLogits) noexcept
inline bool constexpr computeGenerationLogits() const noexcept
inline void constexpr computeGenerationLogits(bool computeGenerationLogits) noexcept
inline ModelVariant getModelVariant() const
inline void setModelVariant(ModelVariant modelVariant)
inline SizeType32 getMaxDecodingDraftTokens() const
inline SizeType32 constexpr getMaxDecodingTokens() const noexcept
inline void constexpr setContextFMHA(bool contextFMHA) noexcept
inline bool constexpr getContextFMHA() const noexcept
inline void constexpr setPagedContextFMHA(bool pagedContextFMHA) noexcept
inline bool constexpr getPagedContextFMHA() const noexcept
inline void constexpr setPpReduceScatter(bool ppReduceScatter) noexcept
inline bool constexpr getPpReduceScatter() const noexcept
inline bool constexpr useLoraPlugin() const noexcept
inline void constexpr useLoraPlugin(bool useLoraPlugin) noexcept
inline std::vector<LoraModule> const &getLoraModules() const noexcept
inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept
inline SizeType32 constexpr getMlpHiddenSize() const noexcept
inline void constexpr setMlpHiddenSize(SizeType32 mlpHiddenSize) noexcept
inline bool constexpr isKVCacheEnabled() const noexcept
inline bool constexpr isPagedKVCache() const noexcept
inline bool constexpr isContinuousKVCache() const noexcept
inline KVCacheType constexpr getKVCacheType() const noexcept
inline void constexpr setKVCacheType(KVCacheType kvCacheType) noexcept
inline bool constexpr useCrossAttention() const noexcept
inline void constexpr setUseCrossAttention(bool useCrossAttention) noexcept
inline bool constexpr usePositionEmbedding() const noexcept
inline void constexpr setUsePositionEmbedding(bool usePositionEmbedding) noexcept
inline bool constexpr useTokenTypeEmbedding() const noexcept
inline void constexpr setUseTokenTypeEmbedding(bool useTokenTypeEmbedding) noexcept
inline SizeType32 constexpr getMaxLoraRank() const noexcept
inline void constexpr setMaxLoraRank(SizeType32 maxLoraRank) noexcept
inline void setSpeculativeDecodingMode(SpeculativeDecodingMode mode) noexcept
inline bool hasSpeculativeDecodingModule() const noexcept
inline SpeculativeDecodingModule const &getSpeculativeDecodingModule() const noexcept
inline std::shared_ptr<SpeculativeDecodingModule const> getSpeculativeDecodingModulePtr() const noexcept
inline std::shared_ptr<SpeculativeDecodingModule> getSpeculativeDecodingModulePtr() noexcept
inline void setSpeculativeDecodingModule(std::shared_ptr<SpeculativeDecodingModule> const &speculativeDecodingModule) noexcept
inline nvinfer1::DataType getKvDataType() const noexcept
inline bool constexpr isTransformerBased() const noexcept
inline bool hasRnnConfig() const noexcept
inline std::optional<RnnConfig> getRnnConfig() const noexcept
inline void setRnnConfig(RnnConfig const &rnnConfig) noexcept
inline bool constexpr isRnnBased() const noexcept
inline std::vector<LayerType> const &getLayerTypes() const noexcept
inline void setLayerTypes(std::vector<LayerType> const &layerTypes) noexcept
inline SpeculativeDecodingMode constexpr getSpeculativeDecodingMode() const noexcept
inline void setLogitsDtype(nvinfer1::DataType inputDtype) noexcept
inline nvinfer1::DataType constexpr getLogitsDtype() const noexcept
inline void setUseShapeInference(bool useShapeInference) noexcept
inline bool useShapeInference() const noexcept
inline ManageWeightsType getManageWeightsType() const noexcept
inline void setManageWeightsType(const ManageWeightsType manageWeightType) noexcept
inline std::string const &getModelName() const noexcept
inline void setModelName(std::string const &modelName)
inline std::vector<SizeType32> const &getNumKvHeadsPerLayer() const
inline std::pair<std::vector<SizeType32>::const_iterator, std::vector<SizeType32>::const_iterator> getNumKvHeadsPerLayerLocalRange(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
inline void setNumKvHeadsPerLayer(std::vector<SizeType32> const &headsPerLayer)
inline void setNumKvHeadsPerCrossLayer(std::vector<SizeType32> const &headsPerLayer)
inline SizeType32 getSumLocalKvHeads(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
inline bool constexpr skipCrossAttnBlocks() const noexcept
inline void constexpr setSkipCrossAttnBlocks(bool skipCrossAttnBlocks) noexcept

Public Static Functions

static inline KVCacheType KVCacheTypeFromString(std::string value)
static inline std::vector<SizeType32> getOptProfilesSplitPoints() noexcept

Public Static Attributes

static constexpr std::array kOPT_PROFILES_SPLIT_POINTS = {64, 128, 256, 512, 1024}
static constexpr SizeType32 kDEFAULT_NUM_TOKENS_PER_BLOCK = 64

Private Members

SizeType32 mVocabSize
SizeType32 mNbLayers
SizeType32 mNbAttentionLayers
SizeType32 mNbRnnLayers
SizeType32 mNbHeads
SizeType32 mHiddenSize
SizeType32 mSizePerHead
nvinfer1::DataType mDataType
bool mUseGptAttentionPlugin
bool mUseMambaConv1dPlugin
bool mInputPacked
bool mPagedState
SizeType32 mTokensPerBlock
common::QuantMode mQuantMode
SizeType32 mMaxBatchSize
SizeType32 mMaxBeamWidth
SizeType32 mMaxInputLen
SizeType32 mMaxSequenceLen
std::optional<SizeType32> mMaxNumTokens
bool mComputeContextLogits
bool mComputeGenerationLogits
ModelVariant mModelVariant
SizeType32 mMaxPromptEmbeddingTableSize
bool mUseMrope
SizeType32 mMaxPositionEmbeddings
SizeType32 mRotaryEmbeddingDim
bool mContextFMHA
bool mPagedContextFMHA
bool mUseXQA
bool mPpReduceScatter
bool mUseLoraPlugin
std::vector<LoraModule> mLoraModules
SizeType32 mMlpHiddenSize
SizeType32 mMaxLoraRank
std::optional<RnnConfig> mRnnConfig
KVCacheType mKVCacheType = KVCacheType::kCONTINUOUS
SizeType32 mMaxEncoderLen = {}
SizeType32 mEncoderHiddenSize = {}
bool mUseCrossAttention
bool mUsePositionEmbedding
bool mUseTokenTypeEmbedding
std::vector<LayerType> mLayerTypes
std::shared_ptr<SpeculativeDecodingModule> mSpeculativeDecodingModule
SpeculativeDecodingMode mSpeculativeDecodingMode
nvinfer1::DataType mLogitsDtype
bool mUseShapeInference
ManageWeightsType mManageWeightsType
std::string mModelName
std::vector<SizeType32> mNumKvHeadsPerAttentionLayer
std::vector<SizeType32> mNumKvHeadsPerCrossAttentionLayer
bool mSkipCrossAttnBlocks
struct RnnConfig

Public Members

SizeType32 stateSize = 0
SizeType32 convKernel = 0
SizeType32 rnnHiddenSize = 0
SizeType32 rnnHeadSize = 0
SizeType32 rnnConvDimSize = 0
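
The setters above are normally driven by the deserialized engine config; the following hand-written sketch only illustrates how a ModelConfig could be constructed and a few options toggled. All numeric values are chosen purely for illustration and the include path is assumed:

#include "tensorrt_llm/runtime/modelConfig.h"

using tensorrt_llm::runtime::ModelConfig;

ModelConfig makeExampleConfig()
{
    // Hypothetical decoder-only model: 32000-token vocabulary, 24 attention layers,
    // no recurrent layers, 16 heads, hidden size 2048, FP16 weights.
    ModelConfig config(/*vocabSize=*/32000, /*nbLayers=*/24, /*nbAttentionLayers=*/24,
                       /*nbRnnLayers=*/0, /*nbHeads=*/16, /*hiddenSize=*/2048,
                       nvinfer1::DataType::kHALF);
    config.useGptAttentionPlugin(true);
    config.usePackedInput(true);
    config.setKVCacheType(ModelConfig::KVCacheType::kPAGED);
    config.setTokensPerBlock(ModelConfig::kDEFAULT_NUM_TOKENS_PER_BLOCK);
    config.setMaxBatchSize(8);
    config.setMaxInputLen(1024);
    config.setMaxSequenceLen(2048);
    return config;
}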

promptTuningParams.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor>
class GenericPromptTuningParams

Public Types

using TensorPtr = TTensor
using SizeType32 = tensorrt_llm::runtime::SizeType32

Public Functions

inline explicit GenericPromptTuningParams(TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr())

Public Members

TensorPtr embeddingTable
TensorPtr tasks
TensorPtr vocabSize
std::vector<bool> promptTuningEnabled
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>

Public Types

using TensorPtr = ITensor::SharedPtr
using SizeType32 = GenericPromptTuningParams::SizeType32

Public Functions

inline explicit PromptTuningParams(TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr)
void fillTasksTensor(TensorPtr tasksHost, const SizeType32 batchSize, const SizeType32 numContextRequests, std::vector<SizeType32> const &reqBeamWidths, std::vector<SizeType32> const &reqPromptLengths, BufferManager const &manager, bool packedInput)

rawEngine.h

namespace tensorrt_llm
namespace runtime
class RawEngine

Public Types

enum Type

Values:

enumerator FilePath
enumerator AddressWithSize
enumerator HostMemory

Public Functions

inline explicit RawEngine(std::filesystem::path enginePath) noexcept
inline explicit RawEngine(void const *engineAddr, std::size_t engineSize) noexcept
inline explicit RawEngine(nvinfer1::IHostMemory const *engineBuffer) noexcept
inline Type getType() const
inline std::filesystem::path getPath() const
inline std::optional<std::filesystem::path> getPathOpt() const
inline void setPath(std::filesystem::path enginePath)
inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt() const
inline void setManagedWeightsMap(std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap)
inline void const *getAddress() const
inline std::size_t getSize() const
inline nvinfer1::IHostMemory const *getHostMemory() const

Public Members

void const *mEngineAddr = {}
std::size_t mEngineSize = {}

Private Members

Type mType
std::optional<std::filesystem::path> mEnginePath
nvinfer1::IHostMemory const *mEngineBuffer = {}
std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> mManagedWeightsMap
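
RawEngine wraps the three ways an engine can be handed to the runtime: a file path, a raw address plus size, or an nvinfer1::IHostMemory buffer. A small sketch of the first two variants, with the include path and file name purely illustrative:

#include "tensorrt_llm/runtime/rawEngine.h"
#include <filesystem>
#include <vector>

using tensorrt_llm::runtime::RawEngine;

void loadEngines()
{
    // Variant 1: engine serialized to disk (file name is illustrative).
    RawEngine fromFile{std::filesystem::path{"llama.engine"}};

    // Variant 2: engine blob already resident in host memory, filled elsewhere.
    std::vector<char> blob;
    RawEngine fromMemory{blob.data(), blob.size()};
    (void) fromFile; (void) fromMemory;
}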

request.h

namespace tensorrt_llm
namespace runtime
namespace decoder_batch
class Request

Public Types

using TensorConstPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr
using BufferPtr = IBuffer::SharedPtr

Public Functions

inline explicit Request(TensorConstPtr ids, SizeType32 inputLen, std::optional<SizeType32> maxNewTokens = std::nullopt, std::optional<SizeType32> endId = std::nullopt)

Public Members

TensorConstPtr ids
SizeType32 inputLen
std::optional<SizeType32> maxNewTokens
std::optional<SizeType32> endId
BufferPtr draftTokens
std::optional<TensorPtr> draftLogits
TensorPtr embeddingBias
TensorPtr badWordsList
TensorPtr stopWordsList
SizeType32 generatedTokensPerEngineStep
TensorPtr medusaPaths
TensorPtr medusaTreeIds
std::optional<executor::LookaheadDecodingConfig> lookaheadRuntimeConfig
std::optional<executor::EagleConfig> eagleConfig
nvinfer1::DataType dtype
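
A minimal sketch of building a decoder_batch::Request from an existing token tensor. The include path, the end id, and the token budget are assumptions made only for illustration:

#include "tensorrt_llm/runtime/request.h"

namespace tr = tensorrt_llm::runtime;

// inputIds is assumed to already hold the tokenized prompt for one sequence.
tr::decoder_batch::Request makeRequest(tr::ITensor::SharedConstPtr inputIds, tr::SizeType32 inputLen)
{
    // Generate at most 64 new tokens and stop at the (illustrative) end-of-sequence id 2.
    tr::decoder_batch::Request request(std::move(inputIds), inputLen,
                                       /*maxNewTokens=*/64, /*endId=*/2);
    return request;
}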

runtimeDefaults.h

namespace tensorrt_llm
namespace runtime
struct RuntimeDefaults

Public Functions

inline RuntimeDefaults(std::optional<std::vector<SizeType32>> maxAttentionWindowVec, std::optional<SizeType32> sinkTokenLength)
RuntimeDefaults() = default

Public Members

std::optional<std::vector<SizeType32>> maxAttentionWindowVec
std::optional<SizeType32> sinkTokenLength

samplingConfig.h

Defines

SET_FROM_OPTIONAL(varName, VarName, VarType)
namespace tensorrt_llm
namespace runtime
class SamplingConfig

Public Functions

inline explicit SamplingConfig(SizeType32 beamWidth = 1)
inline explicit SamplingConfig(std::vector<SamplingConfig> const &configs)
inline explicit SamplingConfig(executor::SamplingConfig const &samplingConfig, std::optional<executor::ExternalDraftTokensConfig> const &externalDraftTokensConfig)
inline bool validate()
inline bool operator==(SamplingConfig const &other) const
inline SizeType32 getNumReturnBeams() const

Public Members

SizeType32 beamWidth
std::optional<SizeType32> numReturnSequences
OptVec<FloatType> temperature
OptVec<FloatType> originalTemperature
OptVec<SizeType32> minLength
OptVec<FloatType> repetitionPenalty
OptVec<FloatType> presencePenalty
OptVec<FloatType> frequencyPenalty
OptVec<SizeType32> noRepeatNgramSize
OptVec<bool> outputLogProbs
OptVec<bool> cumLogProbs
OptVec<SizeType32> topK
OptVec<FloatType> topP
OptVec<uint64_t> randomSeed
OptVec<FloatType> topPDecay
OptVec<FloatType> topPMin
OptVec<TokenIdType> topPResetIds
OptVec<FloatType> beamSearchDiversityRate
OptVec<FloatType> lengthPenalty
OptVec<SizeType32> earlyStopping
OptVec<FloatType> draftAcceptanceThreshold
OptVec<std::vector<runtime::SizeType32>> topKMedusaHeads
std::optional<bool> normalizeLogProbs

Private Types

using FloatType = float
template<typename T>
using OptVec = std::optional<std::vector<T>>
template<typename T>
using Vec = std::vector<T>

Private Functions

template<typename T>
inline bool validateVec(std::string name, OptVec<T> const &vec, T min, std::optional<T> max = std::nullopt)

Private Static Functions

template<typename T>
static inline OptVec<T> fuseValues(std::vector<SamplingConfig> const &configs, std::function<OptVec<T>(size_t ci)> accessor, T defaultValue)
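
Most sampling knobs are optional vectors (OptVec), so a single-element vector applies one value across the batch. A minimal sketch with values chosen only for illustration and the include path assumed:

#include "tensorrt_llm/runtime/samplingConfig.h"
#include <cstdint>
#include <vector>

using tensorrt_llm::runtime::SamplingConfig;
using tensorrt_llm::runtime::SizeType32;

SamplingConfig makeTopKTopPConfig()
{
    SamplingConfig config(/*beamWidth=*/1);
    // Each knob is an optional vector; one element applies the same value to the whole batch.
    config.temperature = std::vector<float>{0.7f};
    config.topK = std::vector<SizeType32>{40};
    config.topP = std::vector<float>{0.9f};
    config.randomSeed = std::vector<uint64_t>{42};
    bool const ok = config.validate(); // sanity-check the chosen values
    (void) ok;
    return config;
}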

speculativeDecodingMode.h

namespace tensorrt_llm
namespace runtime
class SpeculativeDecodingMode

Public Types

using UnderlyingType = std::uint8_t

Public Functions

inline bool constexpr isNone() const
inline bool constexpr isDraftTokensExternal() const
inline bool constexpr isMedusa() const
inline bool constexpr isLookaheadDecoding() const
inline bool constexpr isExplicitDraftTokens() const
inline bool constexpr isEagle() const
inline bool constexpr updatesPositionIds() const
inline bool constexpr requiresAttentionMask() const
inline bool constexpr predictsDraftTokens() const
inline bool constexpr needsKVCacheRewind() const
inline bool constexpr variableDraftLength() const
inline bool constexpr hasDraftLogits() const
inline bool constexpr needsDecoderPrologue() const
inline bool operator==(SpeculativeDecodingMode const &other) const
inline explicit constexpr SpeculativeDecodingMode(UnderlyingType state)

Public Static Functions

static inline auto constexpr None()
static inline auto constexpr DraftTokensExternal()
static inline auto constexpr Medusa()
static inline auto constexpr LookaheadDecoding()
static inline auto constexpr ExplicitDraftTokens()
static inline auto constexpr Eagle()

Private Functions

inline bool constexpr anyBitSet(UnderlyingType bits) const
inline bool constexpr allBitSet(UnderlyingType bits) const

Private Members

UnderlyingType mState = {kNone}

Private Static Attributes

static UnderlyingType constexpr kNone = {1U << 0U}
static UnderlyingType constexpr kDraftTokensExternal = {1U << 1U}
static UnderlyingType constexpr kMedusa = {1U << 2U}
static UnderlyingType constexpr kLookaheadDecoding = {1U << 3U}
static UnderlyingType constexpr kExplicitDraftTokens = {1U << 4U}
static UnderlyingType constexpr kEagle = {1U << 5U}
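
The mode is a small bit-set selected through the named factory functions; a minimal sketch of choosing a mode and branching on its capability queries (the include path is assumed):

#include "tensorrt_llm/runtime/speculativeDecodingMode.h"

using tensorrt_llm::runtime::SpeculativeDecodingMode;

void configureDecoder()
{
    auto constexpr mode = SpeculativeDecodingMode::Medusa();
    static_assert(mode.isMedusa());
    if (mode.predictsDraftTokens())
    {
        // Modes that predict their own draft tokens need buffers for them; they would be set up here.
    }
}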

speculativeDecodingModule.h

namespace tensorrt_llm
namespace runtime
class SpeculativeDecodingModule

Subclassed by tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule

Public Functions

inline explicit SpeculativeDecodingModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens, SizeType32 maxNumPaths) noexcept
inline explicit SpeculativeDecodingModule() noexcept
virtual ~SpeculativeDecodingModule() = default
SpeculativeDecodingModule(SpeculativeDecodingModule const &o) = default
SpeculativeDecodingModule &operator=(SpeculativeDecodingModule const &o) = default
inline SizeType32 getMaxDraftPathLen() const noexcept
Returns:

max number of draft tokens that can be accepted by one step of the decoder

inline SizeType32 getMaxPathLen() const noexcept

One more than the maximum draft path length, to account for the prediction from the primary head.

Returns:

max number of tokens that a request can grow in one step of the decoder

inline SizeType32 getMaxDecodingDraftTokens() const noexcept
Returns:

max number of draft tokens processed by one step of the decoder

inline SizeType32 getMaxDecodingTokens() const noexcept

One more than the maximum number of decoding draft tokens, to account for the prediction from the primary head.

Returns:

max number of tokens processed by one step of the decoder

inline SizeType32 getNumPackedMasks() const noexcept
inline SizeType32 getMaxNumPaths() const noexcept
inline void setMaxDraftTokens(SizeType32 maxDraftTokens) noexcept
inline void setMaxDraftPathLen(SizeType32 maxDraftPathLen) noexcept
inline void setMaxNumPaths(SizeType32 maxNumPaths) noexcept

Private Functions

inline void computeNumPackedMasks() noexcept

Private Members

SizeType32 mMaxDraftPathLen
SizeType32 mMaxDecodingDraftTokens
SizeType32 mMaxNumPaths
SizeType32 mMaxNumPackedMasks
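
Per the accessor descriptions above, the per-step token counts are simply the draft counts plus one for the primary head's own prediction. A small sketch making that relationship explicit; the constructor arguments and include path are illustrative:

#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
#include <cassert>

using tensorrt_llm::runtime::SpeculativeDecodingModule;

void checkTokenBudget()
{
    // Up to 4 draft tokens per path, 15 draft tokens per step, 8 paths (illustrative values).
    SpeculativeDecodingModule module(/*maxDraftPathLen=*/4, /*maxDecodingDraftTokens=*/15,
                                     /*maxNumPaths=*/8);
    // "One more than" the draft quantities, per the accessor documentation.
    assert(module.getMaxPathLen() == module.getMaxDraftPathLen() + 1);
    assert(module.getMaxDecodingTokens() == module.getMaxDecodingDraftTokens() + 1);
}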

tllmLogger.h

namespace tensorrt_llm
namespace runtime
class TllmLogger : public nvinfer1::ILogger

Public Functions

void log(Severity severity, nvinfer1::AsciiChar const *msg) noexcept override
Severity getLevel()
void setLevel(Severity level)

worldConfig.h

namespace tensorrt_llm
namespace runtime
class WorldConfig

Public Functions

explicit WorldConfig(SizeType32 tensorParallelism = 1, SizeType32 pipelineParallelism = 1, SizeType32 contextParallelism = 1, SizeType32 rank = 0, SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)
inline SizeType32 constexpr getSize() const noexcept
inline SizeType32 constexpr getTensorParallelism() const noexcept
inline bool constexpr isTensorParallel() const noexcept
inline SizeType32 constexpr getPipelineParallelism() const noexcept
inline bool constexpr isPipelineParallel() const noexcept
inline SizeType32 constexpr getContextParallelism() const noexcept
inline bool constexpr isContextParallel() const noexcept
inline SizeType32 constexpr getRank() const noexcept
inline SizeType32 constexpr getGpusPerNode() const noexcept
inline SizeType32 getGpusPerGroup() const noexcept
inline SizeType32 getDevice() const noexcept
inline SizeType32 getDeviceOf(SizeType32 rank) const noexcept
inline SizeType32 constexpr getPipelineParallelRank() const noexcept
inline SizeType32 constexpr getTensorParallelRank() const noexcept
inline SizeType32 constexpr getContextParallelRank() const noexcept
inline SizeType32 constexpr getLocalRank() const noexcept
inline SizeType32 constexpr getNodeRank() const noexcept
inline SizeType32 constexpr getNodeRankOf(SizeType32 rank) const noexcept
inline bool constexpr isFirstPipelineParallelRank() const noexcept
inline bool constexpr isLastPipelineParallelRank() const noexcept

Is my rank the last rank in its pipeline?

inline bool constexpr isFirstTensorParallelRank() const noexcept
inline bool constexpr isFirstContextParallelRank() const noexcept
inline SizeType32 constexpr getLastRank() const noexcept
std::vector<SizeType32> getPipelineParallelGroup() const
std::vector<SizeType32> getTensorParallelGroup() const
std::vector<SizeType32> getContextParallelGroup() const
bool validMpiConfig() const

Public Static Functions

static WorldConfig mpi(SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType32> tensorParallelism = std::nullopt, std::optional<SizeType32> pipelineParallelism = std::nullopt, std::optional<SizeType32> contextParallelism = std::nullopt, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)

Public Static Attributes

static SizeType32 constexpr kDefaultGpusPerNode = 1

Private Members

SizeType32 mTensorParallelism
SizeType32 mPipelineParallelism
SizeType32 mContextParallelism
SizeType32 mRank
SizeType32 mGpusPerNode
std::vector<SizeType32> mDeviceIds
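
A minimal sketch of describing a two-way tensor-parallel, two-way pipeline-parallel job and querying one rank's coordinates. The values and include path are illustrative; in a real multi-process run the rank would typically come from MPI via WorldConfig::mpi:

#include "tensorrt_llm/runtime/worldConfig.h"
#include <iostream>

using tensorrt_llm::runtime::WorldConfig;

void describeRank()
{
    // 2-way tensor parallel x 2-way pipeline parallel = world size 4; this process is rank 3.
    WorldConfig world(/*tensorParallelism=*/2, /*pipelineParallelism=*/2,
                      /*contextParallelism=*/1, /*rank=*/3, /*gpusPerNode=*/4);
    std::cout << "world size: " << world.getSize()
              << ", tp rank: " << world.getTensorParallelRank()
              << ", pp rank: " << world.getPipelineParallelRank()
              << ", device: " << world.getDevice() << std::endl;
    if (world.isLastPipelineParallelRank())
    {
        // Work that only applies to the last pipeline stage could be placed here.
    }
}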