Runtime

bufferManager.h

namespace tensorrt_llm
namespace runtime
class BufferManager
#include <bufferManager.h>

A helper class for managing memory on host and device.
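
A minimal usage sketch (illustrative only; assumes bufferManager.h, cudaStream.h and iTensor.h are included, a CUDA device is available, and ITensor::makeShape is taken from iTensor.h):

auto stream = std::make_shared<CudaStream>();
BufferManager manager{stream};

// Allocate a 16 x 32 float tensor on the GPU and zero it.
auto tensor = manager.gpu(ITensor::makeShape({16, 32}), nvinfer1::DataType::kFLOAT);
manager.setZero(*tensor);

// Copy host data into a new device buffer.
std::vector<float> hostData(512, 1.0f);
auto deviceCopy = manager.copyFrom(hostData, MemoryType::kGPU);

// Wait for the asynchronous operations enqueued on the stream to finish.
stream->synchronize();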

Public Types

using IBufferPtr = IBuffer::UniquePtr
using ITensorPtr = ITensor::UniquePtr
using CudaStreamPtr = std::shared_ptr<CudaStream>

Public Functions

explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)

Construct a BufferManager.

Parameters:

stream – [in] The CUDA stream to use for all operations on the GPU (allocation, de-allocation, copying, etc.).

inline ~BufferManager()

Destructor.

IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.

ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.

IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size and memory type.

ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions and memory type.

inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty IBuffer of the given memory type. It may be resized later.

inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty ITensor of the given memory type. It may be reshaped later.

void setMem(IBuffer &buffer, int32_t value) const

Set the contents of the given buffer to value.

void setZero(IBuffer &buffer) const

Set the contents of the given buffer to zero.

void copy(void const *src, IBuffer &dst, MemoryType srcType) const

Copy src to dst.

void copy(IBuffer const &src, void *dst, MemoryType dstType) const

Copy src to dst.

inline void copy(void const *src, IBuffer &dst) const

Copy src to dst.

inline void copy(IBuffer const &src, void *dst) const

Copy src to dst.

void copy(IBuffer const &src, IBuffer &dst) const

Copy src to dst.

IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

CudaStream const &getStream() const

Get the underlying cuda stream.

std::size_t memoryPoolReserved() const

The current size of the memory reserved by the memory pool.

std::size_t memoryPoolUsed() const

The current size of the memory used by the memory pool.

std::size_t memoryPoolFree() const

The current size of the memory free in the memory pool.

void memoryPoolTrimTo(std::size_t size)

Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
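
A sketch of inspecting and trimming the pool backing the asynchronous GPU allocations (sizes are in bytes; manager is the BufferManager from the sketch above):

auto const reservedBytes = manager.memoryPoolReserved();
auto const usedBytes = manager.memoryPoolUsed();
auto const freeBytes = manager.memoryPoolFree();
manager.memoryPoolTrimTo(usedBytes);   // try to return unused reserved memory to the driver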

Public Static Functions

static IBufferPtr gpuSync(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size on the GPU, using cudaMalloc.

static ITensorPtr gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.

static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size on the CPU.

static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions on the CPU.

static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned IBuffer of the given size on the CPU.

static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned ITensor of the given dimensions on the CPU.

static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.

static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.

static IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size in UVM.

static ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions in UVM.

Public Static Attributes

static constexpr auto kBYTE_TYPE = nvinfer1::DataType::kUINT8

Private Members

CudaStreamPtr mStream
bool const mTrimPool

Private Static Functions

static void initMemoryPool(int device)
static std::size_t memoryPoolReserved(int device)
static std::size_t memoryPoolUsed(int device)
static inline std::size_t memoryPoolFree(int device)
static void memoryPoolTrimTo(int device, std::size_t size)

Friends

friend class ::BufferManagerTest

common.h

namespace tensorrt_llm
namespace runtime

Typedefs

using SizeType = std::int32_t
using TokenIdType = std::int32_t
template<typename T>
using StringPtrMap = std::unordered_map<std::string, std::shared_ptr<T>>

cudaEvent.h

namespace tensorrt_llm
namespace runtime
class CudaEvent

Public Types

using pointer = cudaEvent_t

Public Functions

inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)

Creates a new cuda event. The event will be destroyed in the destructor.

Parameters:

flags – Flags for event creation. By default, event timing is disabled.

inline explicit CudaEvent(pointer event, bool ownsEvent = true)

Pass an existing cuda event to this object.

Parameters:
  • event – The event to pass to this object.

  • ownsEvent – Whether this object owns the event and destroys it in the destructor.

inline pointer get() const

Returns the event associated with this object.

inline void synchronize() const

Synchronizes the event.

Private Types

using element_type = std::remove_pointer_t<pointer>
using EventPtr = std::unique_ptr<element_type, Deleter>

Private Members

EventPtr mEvent
class Deleter

Public Functions

inline explicit Deleter(bool ownsEvent)
inline explicit Deleter()
inline constexpr void operator()(pointer event) const

Private Members

bool mOwnsEvent

cudaStream.h

namespace tensorrt_llm
namespace runtime
class CudaStream

Public Functions

inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)

Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.

Parameters:
  • flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.

  • priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.

inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)

Pass an existing cuda stream to this object.

Parameters:
  • stream – The stream to pass to this object.

  • device – The device on which the stream was created.

  • ownsStream – Whether this object owns the stream and destroys it in the destructor.

inline explicit CudaStream(cudaStream_t stream)

Construct with an existing cuda stream or the default stream by passing nullptr.

inline int getDevice() const

Returns the device on which the stream was created.

inline cudaStream_t get() const

Returns the stream associated with this object.

inline void synchronize() const

Synchronizes the stream.

inline void record(CudaEvent::pointer event) const

Record an event on the stream.

inline void record(CudaEvent const &event) const

Record an event on the stream.

inline void wait(CudaEvent::pointer event) const

Wait for an event.

inline void wait(CudaEvent const &event) const

Wait for an event.
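
A minimal sketch of cross-stream synchronization with a CudaEvent (illustrative only; the enqueued work is elided):

CudaStream producer;
CudaStream consumer;
CudaEvent event;                 // timing disabled by default

// ... enqueue work on producer ...
producer.record(event);          // mark the point the consumer must wait for
consumer.wait(event);            // the consumer stream waits on the device; the host is not blocked
// ... enqueue dependent work on consumer ...
consumer.synchronize();          // block the host until the consumer stream has finished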

Private Types

using StreamPtr = std::unique_ptr<std::remove_pointer_t<cudaStream_t>, Deleter>

Private Members

StreamPtr mStream
int mDevice = {-1}
class Deleter

Public Functions

inline explicit Deleter(bool ownsStream)
inline explicit Deleter()
inline constexpr void operator()(cudaStream_t stream) const

Private Members

bool mOwnsStream

decodingInput.h

namespace tensorrt_llm
namespace runtime
class DecodingInput

Public Types

using TensorPtr = std::shared_ptr<ITensor const>

Public Functions

inline DecodingInput(SizeType maxLength, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxBatchSize, TensorPtr logits, TensorPtr endIds)

Public Members

SizeType step
SizeType maxLength
SizeType maxAttentionWindow
SizeType sinkTokenLength
SizeType maxBatchSize
SizeType maxStopWordsLen
SizeType maxBadWordsLen
TensorPtr logits
std::optional<std::vector<TensorPtr>> logitsVec
TensorPtr endIds
TensorPtr finished
TensorPtr sequenceLimitLength
TensorPtr embeddingBias
TensorPtr lengths
TensorPtr badWordsList
TensorPtr badWordsPtrs
TensorPtr badWordsLens
TensorPtr stopWordsList
TensorPtr stopWordsPtrs
TensorPtr stopWordsLens
TensorPtr noRepeatNgramSize
TensorPtr batchSlots
TensorPtr cacheIndirection
std::optional<MedusaInputs> medusaInputs
class MedusaInputs

Public Members

TensorPtr medusaPaths
TensorPtr medusaTreeIds
std::vector<std::vector<TensorPtr>> medusaLogits
TensorPtr medusaCurTokensPerStep
TensorPtr medusaTargetTokensPerStep

decodingOutput.h

namespace tensorrt_llm
namespace runtime
class DecodingOutput

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit DecodingOutput(TensorPtr ids)

Public Members

TensorPtr ids
TensorPtr newTokensSteps
TensorPtr newTokens
std::vector<TensorPtr> newTokensVec
TensorPtr finished
TensorPtr finishedSum
TensorPtr logProbs
TensorPtr cumLogProbs
TensorPtr parentIds
TensorPtr lengths
TensorPtr cacheIndirection
BeamHypotheses beamHypotheses
std::optional<MedusaOutputs> medusaOutputs

Public Static Attributes

static constexpr float kNegativeInfinity = -1e20f
class BeamHypotheses

Public Functions

void empty(BufferManager &manager)
void reshape(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
void release()
void init(BufferManager &manager, TokenIdType endId)
BeamHypotheses slice(SizeType batchIndex, SizeType size) const

Public Members

TensorPtr outputIdsTgt
TensorPtr sequenceLengthsTgt
TensorPtr cumLogProbs
TensorPtr normedScores
TensorPtr logProbs
TensorPtr minNormedScores
TensorPtr numBeams
TensorPtr isDone
class MedusaOutputs

Public Members

TensorPtr medusaNextDraftTokens
TensorPtr medusaAcceptedTokensLen
TensorPtr medusaAcceptedLengthsCumSum
TensorPtr medusaPathsOffsets

generationInput.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor, typename PromptTuningParams>
class GenericGenerationInput
#include <generationInput.h>

  • endId, is the token ID that marks the end of the input sequence (also known as EOS or end-of-sequence). For example, it is 50,256 for the GPT2 model, which has a vocabulary of 50,257 tokens,

  • padId, is the token ID that is used for padding (i.e. fills in the slots that are at an index greater-or-equal to the input length for padded sequences). It can be set to the same value as endId,

  • ids, is the tensor of input IDs. That tensor must be allocated on the GPU. When the input tensor is padded, the shape of ids is [batchSize, maxInputLength], where batchSize and maxInputLength must respect the maximum sizes in sessionConfig passed to the GptSession constructor. When the input is packed, the shape of ids is [numTokens], where numTokens is the sum of the lengths of the different sequences in the batch,

  • lengths, is the tensor of input sequence lengths. That tensor must be allocated on the GPU and contain batchSize values,

  • packed, indicates if the ids tensor is packed or padded. In this release, that flag must match the value passed to the constructor through the instance of the ModelConfig class. In a future release, the session may be made more flexible and automatically pad or pack the input,

  • embeddingBiasOpt, is a tensor of floating-point values on the GPU that contains the bias to add to the logits during sampling (after the projection from hidden states to logits as the last step of the model). This tensor must have vocabSize elements (as defined in the modelConfig argument passed to the constructor),

  • badWordsList, is a tensor of integers on the GPU that encodes the list of words that have to be banned from generated sequences. Its shape is [2, badWordsLength], as explained below, or [batchSize, 2, badWordsLength] when there is a different list for each sequence in the batch,

  • stopWordsList, is a tensor of integers on the GPU that encodes the list of words that trigger the end of the generation for a sequence. Its shape is [2, stopWordsLength], as explained below, or [batchSize, 2, stopWordsLength] when there is a different list for each sequence in the batch,

  • maxNewTokens, is the maximum number of tokens to generate.

The badWordsList and stopWordsList tensors have the same shape [2, length]. Let’s consider an example with three words to describe the representation of those lists. The first word contains tokens [5, 7, 3], the second one contains [9, 2] and the third one is composed of tokens [6, 2, 4, 1]. In total, there are 9 tokens. That’s the length. The shape of the tensor is [2, 9]. The first row of the tensor must contain the 9 token IDs and the second row must store the inclusive prefix-sum of the word lengths as shown on the following diagram:

   0           3       5              9
   |           |       |              |
   V           V       V              V
[  5,  7,  3,  9,  2,  6,  2,  4,  1]
[  3,  5,  9, -1, -1, -1, -1, -1, -1]

In case all the words are made of a single token, the inner-most dimension of the tensor must be increased by 1 (i.e. the length for 4 words, each made of a single token, must be 5 instead of 4; the shape is [2, 5]).
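
A sketch (illustrative only; the loop and variable names are not part of the API, and <vector> and <cstdint> are assumed) that builds the two rows for the three example words above, ready to be copied to the GPU, e.g. with BufferManager::copyFrom, and assigned to badWordsList or stopWordsList:

std::vector<std::vector<std::int32_t>> words{{5, 7, 3}, {9, 2}, {6, 2, 4, 1}};

std::vector<std::int32_t> tokens;   // first row: concatenated token IDs
std::vector<std::int32_t> offsets;  // second row: inclusive prefix-sum of word lengths
for (auto const& word : words)
{
    tokens.insert(tokens.end(), word.begin(), word.end());
    offsets.push_back(static_cast<std::int32_t>(tokens.size()));
}
offsets.resize(tokens.size(), -1);  // pad with -1 so both rows have the same length
// tokens  == {5, 7, 3, 9, 2, 6, 2, 4, 1}
// offsets == {3, 5, 9, -1, -1, -1, -1, -1, -1}
// Remember the single-token caveat above: if every word has exactly one token,
// extend both rows by one element so the length exceeds the number of words.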

Public Types

using TensorPtr = TTensor

Public Functions

inline explicit GenericGenerationInput(SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)

Public Members

SizeType endId
SizeType padId
TensorPtr ids
TensorPtr lengths
bool packed
TensorPtr embeddingBias
TensorPtr badWordsList
TensorPtr stopWordsList
std::optional<SizeType> maxNewTokens
PromptTuningParams promptTuningParams
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>

Public Types

using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationInput(SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)

generationOutput.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor>
class GenericGenerationOutput
#include <generationOutput.h>

  • ids, is a tensor that contains the output token IDs. Its shape is [batchSize, beamWidth, maxSeqLength] where maxSeqLength is the sum of maxInputLength and maxNewTokens. After generation, it contains, for each sequence, a copy of the input tokens followed by the output tokens. When a sequence is shorter than maxSeqLength, padding tokens are added at the end of the sequence.

Note that the shape of that tensor is different in this version of TensorRT-LLM from its shape in previous versions.

  • logProbs, is a tensor of floating-point values on the GPU to store the log-prob of the generated tokens. Its shape is [maxNewTokens, batchSize, beamWidth]. Its shape will likely change in a future release to match the shape of the output ids tensor.

  • contextLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the context. Its shape is [batchSize, maxSequenceLength, vocabSizePadded]. If remove_input_padding is enabled, its shape is [packedSize, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_context_logits or gather_all_token_logits parameter enabled.

    After inference is complete, the context logits are available in GenerationOutput.contextLogits; these tensors reside on the GPU. For details on how to retrieve them, refer to the gptSessionBenchmark.cpp example.

    It is important to point out that enabling the computation may have an impact on performance (the language modeling head (LM head) has to perform a matrix multiplication on all the context tokens instead of just the last one).

  • generationLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the generation. Its shape is [batchSize, beamWidth, maxOutputLen, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_generation_logits or gather_all_token_logits parameter enabled.

    Generation logits can also be obtained through GenerationOutput.generationLogits after inference is completed.

  • onTokenGenerated, is a callback function invoked in the generation loop to pass newly generated tokens to the caller while the loop continues to execute. An implementation of that callback must accept the output ids tensor, the generation step and a boolean flag that indicates if the generation is complete.
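
A sketch (illustrative only; outputIds and outputLengths are placeholder tensors) of installing the streaming callback:

GenerationOutput outputs{outputIds, outputLengths};
outputs.onTokenGenerated = [](GenerationOutput::TensorPtr const& ids, SizeType step, bool finished)
{
    // Inspect or copy the tokens generated so far; finished is true once generation completes.
};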

Public Types

using TensorPtr = TTensor
using Callback = std::function<void(TensorPtr const &ids, SizeType step, bool finished)>

Public Functions

inline explicit GenericGenerationOutput(TensorPtr ids, TensorPtr lengths)

Public Members

TensorPtr ids
TensorPtr lengths
TensorPtr cumLogProbs
TensorPtr logProbs
TensorPtr contextLogits
TensorPtr generationLogits
Callback onTokenGenerated
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>

Public Types

using Base = GenericGenerationOutput<ITensor::SharedPtr>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationOutput(TensorPtr ids, TensorPtr lengths)

gptDecoder.h

namespace tensorrt_llm
namespace layers
namespace runtime
class IGptDecoder

Subclassed by tensorrt_llm::runtime::GptDecoder< T >

Public Types

using TensorPtr = std::shared_ptr<ITensor>

Public Functions

virtual ~IGptDecoder() = default
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength, std::optional<TensorPtr> const &batchSlots = std::nullopt) = 0
virtual bool forward(DecodingOutput &output, DecodingInput const &input) = 0
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) = 0
virtual SamplingConfig const &getSamplingConfig() = 0

Public Static Functions

static void acceptDraftTokensByIds(ITensor const &targetTokenIds, ITensor const &draftTokenIds, ITensor const &contextLengths, ITensor const &numDraftTokens, ITensor &sequenceLengths, ITensor const &finishedVec, ITensor &finishedFinal, ITensor &finishedSum, ITensor const &batchSlots, BufferManager::CudaStreamPtr const &stream)
static void acceptDraftTokensByLogits(ITensor &draftLogits, ITensor const &targetLogits, ITensor &draftProbs, ITensor &targetProbs, ITensor const &numDraftTokens, ITensor &finished, ITensor const &batchSlots, SizeType vocabSize, SizeType vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, curandState_t *curandState, BufferManager::CudaStreamPtr const &stream)
static void updateKVCacheBasedOnAcceptedTokens(ITensor const &acceptedOffsets, ITensor const &packedAcceptedIds, ITensor const &pointerArray, ITensor const &pastKeyValueLengths, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, BufferManager::CudaStreamPtr stream, SizeType rewindDraftTokenCount, SizeType maxAttentionWindow, SizeType maxBlocksPerSeq, nvinfer1::DataType dtype)
static inline std::unique_ptr<IGptDecoder> create(DecodingMode const &mode, nvinfer1::DataType dtype, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, BufferManager::CudaStreamPtr const &stream, std::optional<runtime::SizeType> maxTokensPerStep = std::nullopt, std::optional<runtime::SizeType> maxNumMedusaHeads = std::nullopt)
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder

Public Types

using CudaStreamPtr = BufferManager::CudaStreamPtr
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

GptDecoder(DecodingMode const &mode, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, CudaStreamPtr const &stream, std::optional<runtime::SizeType> maxTokensPerStep = std::nullopt, std::optional<runtime::SizeType> maxNumMedusaHeads = std::nullopt)
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength, std::optional<TensorPtr> const &batchSlots = std::nullopt) override
virtual bool forward(DecodingOutput &output, DecodingInput const &input) override
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) override
inline virtual SamplingConfig const &getSamplingConfig() override

Private Members

BufferManager mManager
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
TensorPtr mLogProbsTiled
SamplingConfig mSamplingConfig
cudaDeviceProp mProp
size_t mMaxBatchSize

gptDecoderBatch.h

namespace tensorrt_llm
namespace runtime
class GptDecoderBatch : public tensorrt_llm::runtime::IGptDecoderBatch
#include <gptDecoderBatch.h>

GPT decoder class with support for in-flight batching.
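
A sketch of the asynchronous decoding pattern (illustrative only; vocabSize, vocabSizePadded, stream, output and input are placeholders that must be prepared as described by the members below):

GptDecoderBatch decoder{vocabSize, vocabSizePadded, stream};
// decoder.setup(...) and decoder.newRequests(...) must be called before stepping.
auto token = decoder.forwardAsync(output, input);   // enqueue one step without blocking the host
// ... other host-side work ...
decoder.forwardSync(*token);                        // wait for that step to complete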

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = ITensor::SharedPtr
using SharedConstPtr = ITensor::SharedConstPtr

Public Functions

GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream)
virtual void setup(DecodingMode const &mode, SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, bool fusedDecoder, nvinfer1::DataType dtype, GptModelConfig const &modelConfig) override

Setup the decoder before calling forward()

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) override

Initialize the decoder with a new batch of inputs.

virtual void newRequests(std::vector<SizeType> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs) override

Initialize the batched decoder at seqSlots with new requests.

virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::Token const &token) override

Wait for the call to forwardAsync associated with a token to complete.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override

Run one step for all requests without blocking the host thread.

virtual void forwardSync() override

Wait for the last call to forwardAsync to complete.

inline virtual std::vector<bool> getFinished() const override
Returns:

[batchSize], indicators of finished requests

inline virtual TensorPtr getOutputIds(SizeType batchIdx) const override
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu

inline virtual TensorPtr getOutputIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu

virtual CudaEvent finalize(SizeType batchIdx) const override

Gather final beam search results for request batchIdx. The result will only be available after the returned event has completed.

virtual void finalize() const override

Gather final beam search results for all requests.

inline virtual TensorPtr getParentIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu

inline virtual TensorPtr getCumLogProbs() const override
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getCumLogProbs(SizeType batchIdx) const override
Returns:

[maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs() const override
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs(SizeType batchIdx) const override
Returns:

[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getAllNewTokens() const override

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

inline virtual TensorPtr getNewTokens(SizeType iter = 0) const override

Get tokens generated in one step of last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

inline virtual std::vector<SizeType> getNbSteps() const override
Returns:

[batchSize], the number of generation steps executed on each request

inline virtual TensorPtr getNbFinished() const override
Returns:

[1], number of finished sequences, in pinned host memory

inline virtual TensorPtr getNextDraftTokens() const override
Returns:

[batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu

inline virtual TensorPtr getMedusaAcceptedLengthsCumSum() const override
Returns:

[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu

inline virtual TensorPtr getMedusaAcceptedPackedPaths() const override
Returns:

[batchSize * maxMedusaHeads], accepted paths packed into a contiguous tensor, on gpu

Private Types

using GptDecoderPtr = std::unique_ptr<IGptDecoder>
using DecodingInputPtr = std::unique_ptr<DecodingInput>
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>

Private Functions

CudaEvent postProcessRequest(SizeType batchIdx) const

Gather final beam search results for request batchIdx.

void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)

Initialize the decoder at batchIdx with a new request.

void allocateMedusaBuffers()

Allocate buffers for medusa decoding.

void setupMedusa(GptModelConfig const &modelConfig)

Setup buffers for medusa decoding.

void newRequestSpeculativeDecoding(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)

Sets up decoder internal tensors for a new speculative decoding request.

void newRequestMedusa(SizeType batchIdx, decoder_batch::Request const &request)

Sets up decoder internal tensors for a new Medusa request.

void forwardAsyncUnfusedDecoder(SizeType step, decoder_batch::Output &output, decoder_batch::Input const &input, CudaEvent const &eventStart)

Asynchronously calls the unfused decoder for the whole batch in a loop.

void forwardAsyncFusedDecoder(SizeType step, decoder_batch::Output &output, decoder_batch::Input const &input, CudaEvent const &eventStart)

Asynchronously calls the fused decoder for the whole batch.

Private Members

std::size_t const mVocabSize
std::size_t const mVocabSizePadded
CudaStreamPtr mStream
BufferManager mBufferManager
TokenPtr mForwardToken
CudaEvent mForwardEvent
std::vector<CudaStreamPtr> mStreams
std::vector<GptDecoderPtr> mDecoders
std::vector<DecodingInputPtr> mDecodingInputs
std::vector<DecodingOutputPtr> mDecodingOutputs
DecodingInputPtr mJointDecodingInput
DecodingOutputPtr mJointDecodingOutput
std::vector<bool> mAcceptByLogits
TensorPtr mNumDraftTokens
TensorPtr mCurandStates
std::vector<SizeType> mNbSteps
std::vector<bool> mFinished
TensorPtr mFinishedSum
std::vector<SizeType> mMaxNewTokens
std::vector<SizeType> mBeamWidths
std::vector<SizeType> mGeneratedTokensPerEngineStep
TensorPtr mFinishedSteps
TensorPtr mDraftProbs
TensorPtr mTargetProbs
TensorPtr mDraftTokenIds
TensorPtr mDraftLogits
TensorPtr mBatchSlotsSetup
TensorPtr mBatchSlotsDecoder
TensorPtr mBatchSlotsAcceptTokens
TensorPtr mBatchSlotsAcceptLogits
TensorPtr mTargetLogitsPtrs
SizeType mMaxSequenceLength = {}
SizeType mMaxAttentionWindow = {}
SizeType mSinkTokenLength = {}
SizeType mActualBatchSize = {}
SizeType mMaxTokensPerEngineStep = {}
SizeType mMaxStopWordsLen = {}
SizeType mMaxBadWordsLen = {}
SizeType mMaxTokensPerDecoderStep = {}
bool mFusedDecoder = {false}
bool mUseMedusa = {false}

gptJsonConfig.h

namespace tensorrt_llm
namespace runtime
class GptJsonConfig

Public Functions

inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, GptModelConfig const &modelConfig)
inline GptModelConfig getModelConfig() const
inline std::string const &getName() const
inline std::string const &getVersion() const
inline std::string const &getPrecision() const
inline constexpr SizeType getTensorParallelism() const
inline constexpr SizeType getPipelineParallelism() const
inline constexpr SizeType getWorldSize() const
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
inline std::string engineFilename(WorldConfig const &worldConfig) const

Public Static Functions

static GptJsonConfig parse(std::string const &json)
static GptJsonConfig parse(std::istream &json)
static GptJsonConfig parse(std::filesystem::path const &path)

Private Members

std::string const mName
std::string const mVersion
std::string const mPrecision
SizeType const mTensorParallelism
SizeType const mPipelineParallelism
GptModelConfig const mGptModelConfig

gptModelConfig.h

namespace tensorrt_llm
namespace runtime
struct MambaConfig

Public Members

SizeType dState = 0
SizeType dConv = 0
SizeType expand = 0
class GptModelConfig

Public Types

enum class ModelVariant : std::int32_t

Values:

enumerator kGpt
enumerator kGlm
enumerator kMamba

Public Functions

inline explicit GptModelConfig(SizeType vocabSize, SizeType nbLayers, SizeType nbHeads, SizeType hiddenSize, nvinfer1::DataType dtype)
inline constexpr SizeType getVocabSize() const noexcept
inline constexpr SizeType getVocabSizePadded(SizeType worldSize) const noexcept
inline constexpr SizeType getNbLayers(SizeType pipelineParallelism = 1) const
inline constexpr SizeType getNbHeads() const noexcept
inline constexpr SizeType getNbKvHeads() const noexcept
inline constexpr void setNbKvHeads(SizeType nbKvHeads) noexcept
inline constexpr SizeType getHiddenSize() const noexcept
inline constexpr SizeType getSizePerHead() const noexcept
inline constexpr void setSizePerHead(SizeType sizePerHead) noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr bool useGptAttentionPlugin() const noexcept
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
inline constexpr bool useMambaConv1dPlugin() const noexcept
inline constexpr void useMambaConv1dPlugin(bool useMambaConv1dPlugin) noexcept
inline constexpr bool usePackedInput() const noexcept
inline constexpr void usePackedInput(bool inputPacked) noexcept
inline constexpr bool usePagedKvCache() const noexcept
inline constexpr void usePagedKvCache(bool pagedKvCache) noexcept
inline constexpr bool usePagedState() const noexcept
inline constexpr void usePagedState(bool pagedState) noexcept
inline constexpr SizeType getTokensPerBlock() const noexcept
inline constexpr void setTokensPerBlock(SizeType TokensPerBlock) noexcept
inline constexpr common::QuantMode getQuantMode() const noexcept
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
inline constexpr bool supportsInflightBatching() const noexcept
inline constexpr SizeType getMaxBatchSize() const noexcept
inline constexpr void setMaxBatchSize(SizeType maxBatchSize) noexcept
inline constexpr SizeType getMaxBeamWidth() const noexcept
inline constexpr void setMaxBeamWidth(SizeType maxBeamWidth) noexcept
inline constexpr SizeType getMaxInputLen() const noexcept
inline constexpr void setMaxInputLen(SizeType maxInputLen) noexcept
inline constexpr SizeType getMaxSequenceLen() const noexcept
inline constexpr void setMaxSequenceLen(SizeType maxSequenceLen) noexcept
inline constexpr std::optional<SizeType> getMaxNumTokens() const noexcept
inline constexpr void setMaxNumTokens(std::optional<SizeType> maxNumTokens) noexcept
inline constexpr bool usePromptTuning() const noexcept
inline constexpr SizeType getMaxPromptEmbeddingTableSize() const noexcept
inline constexpr void setMaxPromptEmbeddingTableSize(SizeType maxPromptEmbeddingTableSize) noexcept
inline constexpr bool computeContextLogits() const noexcept
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
inline constexpr bool computeGenerationLogits() const noexcept
inline constexpr void computeGenerationLogits(bool computeGenerationLogits) noexcept
inline ModelVariant getModelVariant() const
inline void setModelVariant(ModelVariant modelVariant)
inline constexpr bool useCustomAllReduce() const noexcept
inline constexpr void useCustomAllReduce(bool customAllReduce) noexcept
inline constexpr void setMaxDraftLen(SizeType maxDraftLen) noexcept
inline SizeType getMaxDraftLen() const
inline constexpr SizeType getMaxTokensPerStep() const noexcept
inline constexpr void setUseContextFMHAForGeneration(bool useContextFMHAForGeneration) noexcept
inline constexpr bool getContextFMHAForGeneration() const noexcept
inline constexpr void setPagedContextFMHA(bool pagedContextFMHA) noexcept
inline constexpr bool getPagedContextFMHA() const noexcept
inline constexpr bool useLoraPlugin() const noexcept
inline constexpr void useLoraPlugin(bool useLoraPlugin) noexcept
inline std::vector<LoraModule> const &getLoraModules() const noexcept
inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept
inline constexpr SizeType getMlpHiddenSize() const noexcept
inline constexpr void setMlpHiddenSize(SizeType mlpHiddenSize) noexcept
inline constexpr SizeType getMaxLoraRank() const noexcept
inline constexpr void setMaxLoraRank(SizeType maxLoraRank) noexcept
inline constexpr bool useMedusa() const noexcept
inline std::optional<MedusaModule> getMedusaModule() const noexcept
inline void setMedusaModule(MedusaModule const &medusaModule) noexcept
inline nvinfer1::DataType getKvDataType() const noexcept
inline constexpr bool isTransformerBased() const noexcept
inline bool hasMambaConfig() const noexcept
inline std::optional<MambaConfig> getMambaConfig() const noexcept
inline void setMambaConfig(MambaConfig const &mambaConfig) noexcept
inline constexpr bool isSsmBased() const noexcept

Private Members

SizeType mVocabSize
SizeType mNbLayers
SizeType mNbHeads
SizeType mNbKvHeads
SizeType mHiddenSize
SizeType mSizePerHead
nvinfer1::DataType mDataType
bool mUseGptAttentionPlugin
bool mUseMambaConv1dPlugin
bool mInputPacked
bool mPagedKvCache
bool mPagedState
SizeType mTokensPerBlock
common::QuantMode mQuantMode
SizeType mMaxBatchSize
SizeType mMaxBeamWidth
SizeType mMaxInputLen
SizeType mMaxSequenceLen
std::optional<SizeType> mMaxNumTokens
bool mComputeContextLogits
bool mComputeGenerationLogits
ModelVariant mModelVariant
bool mUseCustomAllReduce
SizeType mMaxPromptEmbeddingTableSize
SizeType mMaxDraftLen
bool mUseContextFMHAForGeneration
bool mPagedContextFMHA
bool mUseLoraPlugin
std::vector<LoraModule> mLoraModules
SizeType mMlpHiddenSize
SizeType mMaxLoraRank
std::optional<MedusaModule> mMedusaModule
std::optional<MambaConfig> mMambaConfig

gptSession.h

namespace tensorrt_llm
namespace batch_manager
namespace kv_cache_manager
namespace runtime
class GptSession

Public Types

using LoggerPtr = std::shared_ptr<nvinfer1::ILogger>

Public Functions

GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
Parameters:
  • sessionConfig – Configuration of the session,

  • modelConfig – Description of the model,

  • worldConfig – Description of the environment,

  • engineBuffer – The compiled TensorRT engine (const void*),

  • engineSize – The size in bytes of the TensorRT engine (size_t),

  • logger – The optional logger.

inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
nvinfer1::ILogger &getLogger() const
BufferManager const &getBufferManager() const
inline GptModelConfig const &getModelConfig() const
inline WorldConfig const &getWorldConfig() const
inline int getDevice() const noexcept
inline bool getNormalizeLogProbs() const noexcept
nvinfer1::DataType getLogitDataType() const
void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig, std::shared_ptr<GenerationProfiler> const generationProfiler = nullptr)

This function performs the generation loop.

Given input tensors to read from and output tensors to populate, this member function runs the generation loop until it reaches the maximum number of tokens that can be produced or each sequence has reached completion (due to the production of “end-of-sequence” or a word in the list of “stop words”). The pseudo-code of that function looks like (member function names were changed to keep the presentation simple):

// Have all the sequences in the batch reached completion?
bool allFinished = false;

// Until all sequences are finished or the number of steps reaches the limit...
for (int step = 0; !allFinished && step < maxNewTokens; ++step) {

// Trigger the computation of the logits...
computeLogits(...);

// Run the sampling to produce a token (for each active sequence) from the logits.
allFinished = generateTokensFromLogits(...);

// Callback to stream the output tokens while the generation loop continues.
onTokenGenerated(...);
}
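
A calling sketch (illustrative only; modelConfig, worldConfig, enginePath, logger, endId, padId and the input/output tensors are placeholders prepared as described in generationInput.h and generationOutput.h, and SamplingConfig construction is assumed from its own header):

GptSession::Config sessionConfig{/*maxBatchSize=*/1, /*maxBeamWidth=*/1, /*maxSequenceLength=*/1024};
GptSession session{sessionConfig, modelConfig, worldConfig, enginePath, logger};

GenerationInput inputs{endId, padId, inputIds, inputLengths, /*packed=*/false};
inputs.maxNewTokens = 64;
GenerationOutput outputs{outputIds, outputLengths};

SamplingConfig samplingConfig{/*beamWidth=*/1};
session.generate(outputs, inputs, samplingConfig);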

Private Types

using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
using TensorPtr = runtime::ITensor::SharedPtr
using TokenGeneratedCallback = std::function<void(SizeType step, bool finished)>

Private Functions

inline bool useCudaGraphs()
void generateBatched(std::vector<GenerationOutput> &microBatchesOutputs, std::vector<GenerationInput> const &microBatchesInputs, SamplingConfig const &samplingConfig, TokenGeneratedCallback const &onTokenGenerated, std::shared_ptr<GenerationProfiler> const generationProfiler)
void setup(Config const &sessionConfig)
void createContexts()
void createBuffers(SizeType numMicroBatches)
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches, DecodingMode const &decodingMode)
void createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, KvCacheConfig const &config)
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType> const &generationBatchesOffsets, KvCacheManager const *kvCacheManager)
SizeType executeGenerationStep(SizeType step, std::vector<GenerationInput> const &microBatchesInputs, std::vector<GenerationOutput> &microBatchesOutputs, std::vector<SizeType> const &microBatchOffsets, KvCacheManager *kvCacheManager, std::vector<bool> &microBatchesFinished)
void decoderStepAsync(SizeType decoderStep, SizeType microBatchId)

Execute decoder on last PP rank, receive decoder output on other PP ranks.

bool shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType microBatchId)

Synchronize with the decoder and return the shouldStop flag.

void finalize(SizeType microBatchId)

Collect final output ids and log probs on last PP rank and send them to first PP rank.

Receiving is asynchronous on the host, so synchronization is required before access.

void kvCacheAddSequences(SizeType beamWidth, SizeType microBatchId, SizeType firstBatchIdx)
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType microBatchId) const

Populate outputIds and return reference to newTokens tensor.

TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)

Private Members

GptModelConfig const mModelConfig
WorldConfig const mWorldConfig
int mDevice = {-1}
std::shared_ptr<NcclCommunicator> mPipelineComm
std::shared_ptr<CudaStream> mCommStream
CudaEvent mCommEvent = {}
ITensor::SharedPtr mCommPtrs
std::vector<std::shared_ptr<IpcMemory>> mIpcMemoryHandles
SizeType mDecoderMaxSequenceLength = {}
SizeType mDecoderMaxAttentionWindow = {}
SizeType mDecoderSinkTokenLength = {}
LoggerPtr mLogger
std::shared_ptr<TllmRuntime> mRuntime
std::shared_ptr<KvCacheManager> mKvCacheManager
MicroBatchConfig mMicroBatchConfig
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
std::vector<CudaEvent> mReceivedEvents
bool mCudaGraphMode = {false}
std::vector<CudaGraphExecutor> mCudaGraphInstances
bool mNormalizeLogProbs = true

Friends

friend class batch_manager::TrtGptModelV1
class Config
#include <gptSession.h>

Configuration for session execution and buffer sizes. generate may be called with batch size and beam width smaller than the configured parameters.

maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.

Public Functions

inline Config(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength)

Public Members

SizeType maxBatchSize
SizeType maxBeamWidth
SizeType maxSequenceLength
bool decoderPerRequest = {false}
bool cudaGraphMode = {false}
KvCacheConfig kvCacheConfig = {}
std::optional<SizeType> ctxMicroBatchSize = std::nullopt
std::optional<SizeType> genMicroBatchSize = std::nullopt
std::optional<DecodingMode> decodingMode = std::nullopt
bool normalizeLogProbs = true
class CudaGraphExecutor

Public Functions

CudaGraphExecutor() = default
inline ~CudaGraphExecutor()
inline bool hasInstance()
void clear()
void prepareNextGraph(TllmRuntime const &runtime, SizeType nextContextId)
void launch(CudaStream const &stream)

Private Functions

void create(cudaGraph_t const &graph)
bool update(cudaGraph_t const &graph)
void uploadToStream(CudaStream const &stream)

Private Members

cudaGraphExec_t mInstance
class GenerationProfiler
#include <gptSession.h>

Optional profiler class to profile the generation phase of an inference request.

Public Functions

inline GenerationProfiler()
inline CudaEvent const &getStart() const
inline CudaEvent const &getEnd() const
inline float getElapsedTimeMs()

Public Static Attributes

static constexpr unsigned int flags = {cudaEventDefault}

Private Members

CudaEvent start
CudaEvent end
class MicroBatchConfig

Public Functions

inline MicroBatchConfig()
explicit MicroBatchConfig(SizeType maxBatchSize, SizeType pipelineParallelism, std::optional<SizeType> genMicroBatchSize, std::optional<SizeType> ctxMicroBatchSize)
inline constexpr SizeType numCtxPerGen() const
inline constexpr SizeType getGenGraphId(SizeType flipFlopId, SizeType generationBatchId) const

flip-flop between 2 graph instances for each generation batch.

Public Members

SizeType numCtxBatches
SizeType numGenBatches
SizeType ctxBatchSize
SizeType genBatchSize
namespace utils

Functions

std::vector<uint8_t> loadEngine(std::string const &enginePath)

iBuffer.h

template<>
struct MemoryTypeString<MemoryType::kGPU>

Public Static Attributes

static constexpr auto value = "GPU"
template<>
struct MemoryTypeString<MemoryType::kCPU>

Public Static Attributes

static constexpr auto value = "CPU"
template<>
struct MemoryTypeString<MemoryType::kPINNED>

Public Static Attributes

static constexpr auto value = "PINNED"
template<>
struct MemoryTypeString<MemoryType::kUVM>

Public Static Attributes

static constexpr auto value = "UVM"
template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

Public Types

using type = float

Public Static Attributes

static constexpr char name[] = "float"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>

Public Types

using type = half

Public Static Attributes

static constexpr char name[] = "half"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>

Public Types

using type = std::int8_t

Public Static Attributes

static constexpr char name[] = "int8"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>

Public Types

using type = std::int32_t

Public Static Attributes

static constexpr char name[] = "int32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>

Public Types

using type = std::int64_t

Public Static Attributes

static constexpr char name[] = "int64"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

Public Types

using type = std::uint32_t

Public Static Attributes

static constexpr char name[] = "uint32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

Public Types

using type = std::uint64_t

Public Static Attributes

static constexpr char name[] = "uint64"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

Public Types

using type = bool

Public Static Attributes

static constexpr char name[] = "bool"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

Public Types

using type = std::uint8_t

Public Static Attributes

static constexpr char name[] = "uint8"
static constexpr auto size = sizeof(type)
template<>
struct TRTDataType<std::int8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT8
template<>
struct TRTDataType<std::int32_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT32
template<>
struct TRTDataType<std::uint32_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
template<>
struct TRTDataType<std::int64_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT64
template<>
struct TRTDataType<std::uint64_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
template<>
struct TRTDataType<std::uint8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kUINT8
namespace tensorrt_llm
namespace runtime

Typedefs

template<typename T>
using PointerElementType = typename std::remove_reference_t<T>::element_type

Enums

enum class MemoryType : std::int32_t

Values:

enumerator kGPU
enumerator kCPU
enumerator kPINNED
enumerator kUVM

Functions

template<typename T>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::shared_ptr<T> const &ptr) noexcept
template<typename T, typename D>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::unique_ptr<T, D> &&ptr) noexcept
template<typename T>
T const *bufferCast(IBuffer const &buffer)
template<typename T>
T *bufferCast(IBuffer &buffer)
std::ostream &operator<<(std::ostream &output, IBuffer const &buffer)

Utility function to print a buffer.
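
A short sketch combining the functions above (assumes bufferManager.h, iBuffer.h and <iostream> are included):

auto buffer = BufferManager::cpu(8, nvinfer1::DataType::kINT32);
auto* data = bufferCast<std::int32_t>(*buffer);
for (std::size_t i = 0; i < buffer->getSize(); ++i)
    data[i] = static_cast<std::int32_t>(i);
std::cout << *buffer << std::endl;   // prints the buffer via the operator<< above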

template<MemoryType T>
struct MemoryTypeString
template<>
struct MemoryTypeString<MemoryType::kGPU>

Public Static Attributes

static constexpr auto value = "GPU"
template<>
struct MemoryTypeString<MemoryType::kCPU>

Public Static Attributes

static constexpr auto value = "CPU"
template<>
struct MemoryTypeString<MemoryType::kPINNED>

Public Static Attributes

static constexpr auto value = "PINNED"
template<>
struct MemoryTypeString<MemoryType::kUVM>

Public Static Attributes

static constexpr auto value = "UVM"
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits
#include <iBuffer.h>

For converting a TensorRT data type to a C++ data type.
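
For example (a sketch; assumes iBuffer.h and <type_traits> are included):

static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kFLOAT>::type, float>, "kFLOAT maps to float");
static_assert(DataTypeTraits<nvinfer1::DataType::kHALF>::size == 2, "half occupies 2 bytes");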

template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

Public Types

using type = float

Public Static Attributes

static constexpr char name[] = "float"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>

Public Types

using type = half

Public Static Attributes

static constexpr char name[] = "half"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>

Public Types

using type = std::int8_t

Public Static Attributes

static constexpr char name[] = "int8"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>

Public Types

using type = std::int32_t

Public Static Attributes

static constexpr char name[] = "int32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>

Public Types

using type = std::int64_t

Public Static Attributes

static constexpr char name[] = "int64"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

Public Types

using type = std::uint32_t

Public Static Attributes

static constexpr char name[] = "uint32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

Public Types

using type = std::uint64_t

Public Static Attributes

static constexpr char name[] = "uint64"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

Public Types

using type = bool

Public Static Attributes

static constexpr char name[] = "bool"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

Public Types

using type = std::uint8_t

Public Static Attributes

static constexpr char name[] = "uint8"
static constexpr auto size = sizeof(type)
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>

Public Types

using type = typename DataTypeTraits<kDataType, kUnsigned, false>::type*

Public Static Attributes

static constexpr char name[] = "*"
static constexpr auto size = sizeof(type)
class BufferDataType
#include <iBuffer.h>

A wrapper around nvinfer1::DataType that provides support for pointer types.

Public Functions

inline constexpr BufferDataType(nvinfer1::DataType dataType, bool _unsigned = false, bool pointer = false)
inline constexpr operator nvinfer1::DataType() const noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr bool isPointer() const noexcept
inline constexpr bool isUnsigned() const
inline constexpr std::size_t getSize() const noexcept

Public Static Attributes

static constexpr auto kTrtPointerType = nvinfer1::DataType::kINT64

Private Members

nvinfer1::DataType mDataType
bool mUnsigned
bool mPointer
template<typename T, bool = false>
struct TRTDataType
#include <iBuffer.h>

For converting a C++ data type to a TensorRT data type.
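
For example (a sketch; assumes iBuffer.h is included):

static_assert(TRTDataType<float>::value == nvinfer1::DataType::kFLOAT, "float maps to kFLOAT");
static_assert(TRTDataType<std::int32_t>::value == nvinfer1::DataType::kINT32, "int32_t maps to kINT32");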

template<>
struct TRTDataType<float>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kFLOAT
template<>
struct TRTDataType<half>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kHALF
template<>
struct TRTDataType<std::int8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT8
template<>
struct TRTDataType<std::int32_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT32
template<>
struct TRTDataType<std::uint32_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
template<>
struct TRTDataType<std::int64_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT64
template<>
struct TRTDataType<std::uint64_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
template<>
struct TRTDataType<bool>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kBOOL
template<>
struct TRTDataType<std::uint8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kUINT8
template<>
struct TRTDataType<void*>

Public Static Attributes

static constexpr auto value = BufferDataType::kTrtPointerType
template<typename T>
struct TRTDataType<T*>

Public Static Attributes

static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}

Private Static Attributes

static constexpr auto kUnderlyingType = BufferDataType{TRTDataType<T, false>::value}
class IBuffer

Subclassed by tensorrt_llm::runtime::ITensor

Public Types

using UniquePtr = std::unique_ptr<IBuffer>
using SharedPtr = std::shared_ptr<IBuffer>
using UniqueConstPtr = std::unique_ptr<IBuffer const>
using SharedConstPtr = std::shared_ptr<IBuffer const>
using DataType = nvinfer1::DataType

Public Functions

virtual void *data() = 0

Returns a pointer to underlying array.

virtual void const *data() const = 0

Returns a pointer to underlying array.

inline virtual void *data(std::size_t index)

Returns a pointer to the underlying array at a given element index.

inline virtual void const *data(std::size_t index) const

Returns a pointer to the underlying array at a given element index.

virtual std::size_t getSize() const = 0

Returns the size (in number of elements) of the buffer.

inline virtual std::size_t getSizeInBytes() const

Returns the size (in bytes) of the buffer.

virtual std::size_t getCapacity() const = 0

Returns the capacity of the buffer.

virtual DataType getDataType() const = 0

Returns the data type of the buffer.

virtual char const *getDataTypeName() const
virtual MemoryType getMemoryType() const = 0

Returns the memory type of the buffer.

virtual char const *getMemoryTypeName() const
virtual void resize(std::size_t newSize) = 0

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

virtual void release() = 0

Releases the buffer. It will be reset to nullptr.

virtual ~IBuffer() = default
IBuffer(IBuffer const&) = delete

Not allowed to copy.

IBuffer &operator=(IBuffer const&) = delete

Not allowed to copy.

Public Static Functions

static UniquePtr slice(SharedPtr buffer, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.

Parameters:
  • buffer – The buffer to view.

  • offset – The offset of the view.

  • size – The size of the view.

Returns:

A view on the buffer.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
static inline UniquePtr slice(SharedPtr buffer, std::size_t offset)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
static inline UniquePtr view(SharedPtr tensor)

Returns a view on the underlying tensor which can be independently resized.

Parameters:

tensor – The tensor to view.

Returns:

A view on the tensor.

static inline UniquePtr view(SharedPtr tensor, std::size_t size)

Returns a view on the underlying tensor with a different size.

Parameters:
  • tensor – The tensor to view.

  • size – The size of the view.

Returns:

A view on the tensor.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)

Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.

Parameters:
  • data – The data to wrap.

  • type – The data type of the data.

  • size – The size of the buffer.

  • capacity – The capacity of the buffer.

Returns:

An IBuffer.

static inline UniquePtr wrap(void *data, DataType type, std::size_t size)
template<typename T>
static inline UniquePtr wrap(T *data, std::size_t size, std::size_t capacity)
template<typename T>
static inline UniquePtr wrap(T *data, std::size_t size)
template<typename T>
static inline UniquePtr wrap(std::vector<T> &v)
static MemoryType memoryType(void const *data)

Determine the memory type of a pointer.
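
A minimal usage sketch of wrapping existing host memory in an IBuffer, assuming the class is declared in iBuffer.h; the vector contents are illustrative, and the buffer does not take ownership of the memory:

    #include <iBuffer.h>
    #include <cstdint>
    #include <vector>

    using namespace tensorrt_llm::runtime;

    std::vector<std::int32_t> tokens{1, 2, 3, 4};
    // Wrap the vector without copying; the IBuffer cannot be resized
    // beyond the vector's current size (its capacity).
    IBuffer::UniquePtr buffer = IBuffer::wrap(tokens);
    auto const numElements = buffer->getSize();                  // 4
    auto const typeName = buffer->getDataTypeName();             // e.g. "INT32" (exact string is an assumption)
    auto const memType = IBuffer::memoryType(tokens.data());     // host memory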

Protected Functions

IBuffer() = default
inline std::size_t toBytes(std::size_t size) const

Converts an element index or size into the corresponding size in bytes.

template<typename T>
class BufferRange : public tensorrt_llm::common::ArrayView<T>

Public Types

using Base = tensorrt_llm::common::ArrayView<T>

Public Functions

inline BufferRange(T *data, size_type size)
inline explicit BufferRange(IBuffer &buffer)
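
A minimal sketch of typed element access through BufferRange, assuming the ArrayView base provides begin()/end() iterators:

    std::vector<float> values{0.f, 1.f, 2.f};
    auto buffer = IBuffer::wrap(values);   // host buffer viewing the vector's memory
    BufferRange<float> range(*buffer);     // typed view over the same elements
    for (auto& v : range)                  // in-place update of every element
    {
        v *= 2.0f;
    }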

iGptDecoderBatch.h

namespace tensorrt_llm
namespace runtime
class IGptDecoderBatch : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
#include <iGptDecoderBatch.h>

GPT decoder class with support for in-flight batching.

Subclassed by tensorrt_llm::runtime::GptDecoderBatch

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = std::shared_ptr<ITensor>
using TokenPtr = std::unique_ptr<decoder_batch::Token const>

Public Functions

virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::Token const &token) = 0

Wait for the call to forwardAsync associated with a token to complete.

inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)

Run one step for all requests and wait for completion on the host.
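
As an illustration of the asynchronous pattern, here is a minimal sketch assuming decoder is a concrete IGptDecoderBatch (e.g. GptDecoderBatch) and input/output have already been populated:

    // Launch one decoding step; the host is not blocked.
    auto token = decoder.forwardAsync(output, input);
    // ... overlap other host-side work here ...
    // Block until the step associated with the token has completed.
    decoder.forwardSync(*token);
    // forward(output, input) is the blocking equivalent of the two calls above.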

virtual TensorPtr getOutputIds(SizeType batchIdx) const = 0
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu

virtual CudaEvent finalize(SizeType batchIdx) const = 0

Gather final beam search results for request batchIdx. Results will only be available after the returned event has completed.

virtual std::vector<bool> getFinished() const = 0
Returns:

[batchSize (actual)], marks finished requests (per batch)

virtual TensorPtr getCumLogProbs() const = 0
Returns:

[batchSize, beamWidth], cumulative log probabilities (per beam), on gpu

virtual TensorPtr getCumLogProbs(SizeType batchIdx) const = 0
Returns:

[beamWidth], cumulative log probabilities (per beam) for request batchIdx, on gpu

virtual TensorPtr getLogProbs() const = 0
Returns:

[batchSize, beamWidth, maxSeqLen], log probabilities (per beam), on gpu

virtual TensorPtr getLogProbs(SizeType batchIdx) const = 0
Returns:

[beamWidth, maxSeqLen], log probabilities (per beam) for request batchIdx, on gpu

virtual TensorPtr getParentIds() const = 0
virtual std::vector<SizeType> getNbSteps() const = 0
virtual void newRequests(std::vector<SizeType> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs) = 0

Initialize the batched decoder at seqSlots with new requests.

virtual TensorPtr getNextDraftTokens() const = 0
Returns:

[batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu

virtual TensorPtr getMedusaAcceptedLengthsCumSum() const = 0
Returns:

[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu

virtual TensorPtr getMedusaAcceptedPackedPaths() const = 0
Returns:

[batchSize * maxMedusaHeads], accepted paths packed into continuous tensor, on gpu

Protected Functions

IGptDecoderBatch() = default
namespace decoder_batch

Typedefs

using Output = decoder::Output
class Request

Public Types

using ConstTensorPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr
using BufferPtr = IBuffer::SharedPtr

Public Functions

inline explicit Request(ConstTensorPtr ids, SizeType inputLen, std::optional<SizeType> maxNewTokens = std::nullopt, std::optional<SizeType> endId = std::nullopt)

Public Members

ConstTensorPtr ids
SizeType inputLen
std::optional<SizeType> maxNewTokens
std::optional<SizeType> endId
BufferPtr draftTokens
std::optional<TensorPtr> draftLogits
TensorPtr embeddingBias
TensorPtr badWordsList
TensorPtr stopWordsList
bool computeCumLogProbs
bool computeLogProbs
SizeType generatedTokensPerEngineStep
TensorPtr medusaPaths
TensorPtr medusaTreeIds
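
A minimal sketch of constructing a request, assuming inputIds is an ITensor::SharedConstPtr holding the prompt token ids and promptLength is its length (both built elsewhere; the remaining values are illustrative):

    decoder_batch::Request request{inputIds, /*inputLen=*/promptLength,
                                   /*maxNewTokens=*/64, /*endId=*/2};
    request.computeCumLogProbs = true;   // also return cumulative log probabilities
    request.computeLogProbs = false;
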
class Input

Public Types

using TensorConstPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit Input(std::vector<TensorConstPtr> const &logits, std::vector<bool> const &active)
inline explicit Input(std::vector<TensorConstPtr> const &logits)
inline explicit Input(std::vector<TensorPtr> const &logits, std::vector<bool> const &active)
inline explicit Input(std::vector<TensorPtr> const &logits)

Public Members

std::vector<TensorConstPtr> logits
std::vector<bool> active
TensorConstPtr cacheIndirection
std::vector<std::vector<TensorConstPtr>> medusaLogits
class Token

Public Functions

inline explicit Token(CudaEvent &&event, std::vector<bool> const &active)

Public Members

CudaEvent event
std::vector<bool> active

iStatefulGptDecoder.h

namespace tensorrt_llm
namespace runtime
class IStatefulGptDecoder
#include <iStatefulGptDecoder.h>

GPT decoder class with support for in-flight batching.

Subclassed by tensorrt_llm::runtime::IGptDecoderBatch

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

virtual void setup(DecodingMode const &mode, SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, bool fusedDecoder, nvinfer1::DataType dtype, GptModelConfig const &modelConfig) = 0

Set up the decoder before calling forward(); this also calls reshapeBuffers.

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) = 0

Initialize the decoder with a new batch of inputs.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0

Run one step for all requests without blocking the host thread.

virtual void forwardSync() = 0

Wait for the last call to forwardAsync to complete.

inline virtual void forward(decoder::Output &output, decoder::Input const &input)

Run one step for all requests.

virtual void finalize() const = 0

Gather final beam search results for all requests.

virtual TensorPtr getOutputIds() const = 0
Returns:

[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu

virtual TensorPtr getCumLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

virtual TensorPtr getLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

virtual TensorPtr getNewTokens(SizeType iter = 0) const = 0

Get tokens generated in one step of the last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

virtual TensorPtr getAllNewTokens() const = 0

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

virtual TensorPtr getNbFinished() const = 0
Returns:

[1], number of finished sequences, in pinned host memory
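
A minimal sketch of the stateful decoding loop, assuming decoder is a concrete implementation already configured via setup(), that inputs, outputs and samplingConfig describe the new batch, that input/output are prepared decoder::Input/decoder::Output objects, and that bufferCast is the usual typed-pointer helper from iBuffer.h (the element type of getNbFinished is assumed here to be SizeType):

    decoder.newBatch(inputs, outputs, samplingConfig);
    for (SizeType step = 0; step < maxNewTokens; ++step)
    {
        decoder.forward(output, input);               // one blocking decoding step
        auto nbFinished = decoder.getNbFinished();    // [1], in pinned host memory
        if (bufferCast<SizeType>(*nbFinished)[0] == batchSize)
            break;                                    // all sequences have finished
    }
    decoder.finalize();                               // gather beam search results
    auto outputIds = decoder.getOutputIds();          // [batchSize, beamWidth, maxSequenceLength]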

virtual ~IStatefulGptDecoder() = default

Protected Functions

IStatefulGptDecoder() = default
namespace decoder
class Input

Public Types

using TensorPtr = std::shared_ptr<ITensor const>

Public Functions

inline explicit Input(TensorPtr logits)

Public Members

TensorPtr logits
TensorPtr cacheIndirection
class Output

Public Types

using TensorPtr = std::shared_ptr<ITensor>

Public Functions

Output() = default

Public Members

TensorPtr cacheIndirection
TensorPtr sequenceLengths

iTensor.h

namespace nvinfer1
namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)

Utility function to print a shape.

std::ostream &operator<<(std::ostream &output, ITensor const &tensor)

Utility function to print a tensor with its shape.

class ITensor : public virtual tensorrt_llm::runtime::IBuffer

Public Types

using UniquePtr = std::unique_ptr<ITensor>
using SharedPtr = std::shared_ptr<ITensor>
using UniqueConstPtr = std::unique_ptr<ITensor const>
using SharedConstPtr = std::shared_ptr<ITensor const>
using Shape = nvinfer1::Dims
using DimType = std::remove_reference_t<decltype(Shape::d[0])>

Public Functions

~ITensor() override = default
virtual Shape const &getShape() const = 0

Returns the tensor dimensions.

virtual void reshape(Shape const &dims) = 0

Sets the tensor dimensions. The new size of the tensor will be volume(dims)

inline virtual void resize(std::size_t newSize) override

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

ITensor(ITensor const&) = delete

Not allowed to copy.

ITensor &operator=(ITensor const&) = delete

Not allowed to copy.

inline void squeeze(SizeType dim)

Removes the given unit dimension from this tensor.

inline void unsqueeze(SizeType dim)

Adds a unit dimension at the specified position.

inline bool shapeEquals(Shape const &other) const
inline bool shapeEquals(std::initializer_list<SizeType> const &other) const
template<typename T>
inline bool shapeEquals(T const *dims, SizeType count) const

Public Static Functions

static inline std::int64_t volume(Shape const &dims)

Returns the volume of the dimensions. Returns -1 if dims.nbDims < 0.

static inline std::size_t volumeNonNegative(Shape const &shape)

Returns the volume of the dimensions. Throws if shape.nbDims < 0.

static Shape squeeze(Shape const &shape, SizeType dim)

Removes the given unit dimension from shape.

Parameters:
  • shape – The shape to squeeze.

  • dim – The dimension that should be removed (“squeezed”).

Returns:

A new shape without the unit dimension.

static Shape unsqueeze(Shape const &shape, SizeType dim)

Add a unit dimension to shape at the specified position.

Parameters:
  • shape – The shape to unsqueeze.

  • dim – The dimension where unit dimension should be added.

Returns:

A new shape with the added unit dimension.

static UniquePtr slice(SharedPtr tensor, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.

Parameters:
  • tensor – The tensor to view.

  • offset – The offset of the view w.r.t. dimension 0 of the tensor.

  • size – The size of the view w.r.t. dimension 0 of the tensor.

Returns:

A view on the buffer.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
static inline UniquePtr slice(SharedPtr tensor, std::size_t offset)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
static UniquePtr view(IBuffer::SharedPtr buffer, Shape const &dims)

Returns a view on the underlying buffer (or tensor) with the given shape.

Parameters:
  • buffer – The buffer to view.

  • dims – The shape of the view.

Returns:

A view on the tensor.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
static inline UniquePtr view(SharedPtr tensor)

Returns a view on the underlying tensor which can be independently reshaped.

Parameters:

tensor – The tensor to view.

Returns:

A view on the tensor.

static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)

Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.

Parameters:
  • data – The data to wrap.

  • type – The data type of the data.

  • shape – The shape of the tensor.

  • capacity – The capacity of the buffer.

Returns:

An ITensor.

static inline UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(std::vector<T> &v, Shape const &shape)
static Shape makeShape(std::initializer_list<SizeType> const &dims)

A convenience function to create a tensor shape with the given dimensions.

static std::string toString(Shape const &dims)

A convenience function for converting a tensor shape to a string.

static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)

A convenience function to compare shapes.

template<typename T>
static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType count)

A convenience function to compare shapes.
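
The static shape helpers compose naturally; a short example (the exact string produced by toString is an assumption):

    auto shape = ITensor::makeShape({2, 3, 4});
    auto const numElements = ITensor::volume(shape);          // 24
    auto expanded = ITensor::unsqueeze(shape, 0);             // unit dimension added at position 0
    auto restored = ITensor::squeeze(expanded, 0);            // unit dimension removed again
    bool const same = ITensor::shapeEquals(shape, restored);  // true
    std::string text = ITensor::toString(shape);              // human-readable form of the shape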

Protected Functions

ITensor() = default

Protected Static Functions

static inline DimType castSize(size_t newSize)

ipcUtils.h

namespace tensorrt_llm
namespace runtime

Functions

void setPeerAccess(WorldConfig const &worldConfig, bool enable = true)
class IpcMemory

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

IpcMemory(WorldConfig const &worldConfig, std::size_t bufferSize)
~IpcMemory()
inline std::vector<void*> const &getCommPtrsTensor() const

Public Static Attributes

static constexpr size_t FLAGS_SIZE = kernels::MAX_ALL_REDUCE_BLOCKS * sizeof(uint32_t)

Private Functions

void allocateIpcMemory()
void destroyIpcMemory()

Private Members

WorldConfig mWorldConfig
std::vector<void*> mCommPtrs
std::size_t mBufferSize
void *mBufferPtr = {nullptr}

memoryCounters.h

namespace tensorrt_llm
namespace runtime
class MemoryCounters

Public Types

using SizeType = std::size_t
using DiffType = std::ptrdiff_t

Public Functions

MemoryCounters() = default
inline SizeType getGpu() const
inline SizeType getCpu() const
inline SizeType getPinned() const
inline SizeType getUVM() const
inline DiffType getGpuDiff() const
inline DiffType getCpuDiff() const
inline DiffType getPinnedDiff() const
inline DiffType getUVMDiff() const
template<MemoryType T>
inline void allocate(SizeType size)
void allocate(MemoryType memoryType, SizeType size)
template<MemoryType T>
inline void deallocate(SizeType size)
void deallocate(MemoryType memoryType, SizeType size)
std::string toString() const

Public Static Functions

static MemoryCounters &getInstance()
static std::string bytesToString(SizeType bytes, int precision = 2)
static std::string bytesToString(DiffType bytes, int precision = 2)
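
A short usage sketch of the global counters:

    #include <iostream>

    auto& counters = MemoryCounters::getInstance();
    std::cout << counters.toString() << std::endl;   // summary of all memory types
    std::cout << "GPU allocated: "
              << MemoryCounters::bytesToString(counters.getGpu()) << std::endl;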

Private Members

std::atomic<SizeType> mGpu = {}
std::atomic<SizeType> mCpu = {}
std::atomic<SizeType> mPinned = {}
std::atomic<SizeType> mUVM = {}
std::atomic<DiffType> mGpuDiff = {}
std::atomic<DiffType> mCpuDiff = {}
std::atomic<DiffType> mPinnedDiff = {}
std::atomic<DiffType> mUVMDiff = {}

promptTuningParams.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor>
class GenericPromptTuningParams

Public Types

using TensorPtr = TTensor
using SizeType = tensorrt_llm::runtime::SizeType

Public Functions

inline explicit GenericPromptTuningParams(TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr())

Public Members

TensorPtr embeddingTable
TensorPtr tasks
TensorPtr vocabSize
std::vector<bool> promptTuningEnabled
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>

Public Types

using TensorPtr = ITensor::SharedPtr
using SizeType = GenericPromptTuningParams::SizeType

Public Functions

inline explicit PromptTuningParams(TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr)
void fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, const SizeType numContextRequests, std::vector<SizeType> const &reqBeamWidths, std::vector<SizeType> const &reqPromptLengths, BufferManager const &manager, bool packedInput)

samplingConfig.h

Defines

SET_FROM_OPTIONAL(varName, VarName, VarType)
namespace tensorrt_llm
namespace runtime
class SamplingConfig

Public Functions

inline explicit SamplingConfig(SizeType beamWidth = 1)
inline explicit SamplingConfig(std::vector<SamplingConfig> const &configs)
inline explicit SamplingConfig(executor::SamplingConfig const &samplingConfig, std::optional<executor::SpeculativeDecodingConfig> const &specDecodingConfig)
inline bool operator==(SamplingConfig const &other) const

Public Members

SizeType beamWidth
OptVec<FloatType> temperature
OptVec<SizeType> minLength
OptVec<FloatType> repetitionPenalty
OptVec<FloatType> presencePenalty
OptVec<FloatType> frequencyPenalty
OptVec<SizeType> topK
OptVec<FloatType> topP
OptVec<uint64_t> randomSeed
OptVec<FloatType> topPDecay
OptVec<FloatType> topPMin
OptVec<SizeType> topPResetIds
OptVec<FloatType> beamSearchDiversityRate
OptVec<FloatType> lengthPenalty
OptVec<SizeType> earlyStopping
OptVec<FloatType> draftAcceptanceThreshold
OptVec<std::vector<runtime::SizeType>> topKMedusaHeads
std::optional<bool> normalizeLogProbs
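
A minimal sketch of configuring sampling for a batch with a single shared value per option; the numbers are illustrative:

    SamplingConfig config{/*beamWidth=*/1};
    config.temperature = std::vector<float>{0.8f};
    config.topK = std::vector<SizeType>{40};
    config.topP = std::vector<float>{0.95f};
    config.randomSeed = std::vector<uint64_t>{1234ULL};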

Private Types

using FloatType = float
template<typename T>
using OptVec = std::optional<std::vector<T>>
template<typename T>
using Vec = std::vector<T>

Private Static Functions

template<typename T>
static inline OptVec<T> fuseValues(std::vector<SamplingConfig> const &configs, std::function<OptVec<T>(SizeType ci)> accessor)

tllmLogger.h

namespace tensorrt_llm
namespace runtime
class TllmLogger : public nvinfer1::ILogger

Public Functions

void log(Severity severity, nvinfer1::AsciiChar const *msg) noexcept override
Severity getLevel()
void setLevel(Severity level)
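
A short usage sketch:

    TllmLogger logger;
    logger.setLevel(nvinfer1::ILogger::Severity::kWARNING);              // only warnings and errors
    logger.log(nvinfer1::ILogger::Severity::kINFO, "suppressed at this level");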

worldConfig.h

namespace tensorrt_llm
namespace runtime
class WorldConfig

Public Functions

explicit WorldConfig(SizeType tensorParallelism = 1, SizeType pipelineParallelism = 1, SizeType rank = 0, SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType>> const &deviceIds = std::nullopt)
inline constexpr SizeType getSize() const noexcept
inline constexpr SizeType getTensorParallelism() const noexcept
inline constexpr bool isTensorParallel() const noexcept
inline constexpr SizeType getPipelineParallelism() const noexcept
inline constexpr bool isPipelineParallel() const noexcept
inline constexpr SizeType getRank() const noexcept
inline constexpr SizeType getGpusPerNode() const noexcept
inline SizeType getGpusPerGroup() const noexcept
inline SizeType getDevice() const noexcept
inline constexpr SizeType getPipelineParallelRank() const noexcept
inline constexpr SizeType getTensorParallelRank() const noexcept
inline constexpr bool isFirstPipelineParallelRank() const noexcept
inline constexpr bool isLastPipelineParallelRank() const noexcept

Is my rank the last rank in its pipeline?

inline constexpr SizeType getLastRank() const noexcept
std::vector<SizeType> getPipelineParallelGroup() const
bool validMpiConfig() const

Public Static Functions

static WorldConfig mpi(SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt, std::optional<std::vector<SizeType>> const &deviceIds = std::nullopt)

Public Static Attributes

static constexpr SizeType kDefaultGpusPerNode = 8
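
A short sketch of constructing a world configuration by hand (values are illustrative); WorldConfig::mpi() can instead derive the configuration from the MPI environment:

    // 2-way tensor parallel x 2-way pipeline parallel, this process is rank 0.
    WorldConfig worldConfig{/*tensorParallelism=*/2, /*pipelineParallelism=*/2, /*rank=*/0};
    auto const worldSize = worldConfig.getSize();              // tensor * pipeline parallelism
    auto const tpRank = worldConfig.getTensorParallelRank();
    auto const ppRank = worldConfig.getPipelineParallelRank();
    auto const deviceId = worldConfig.getDevice();             // CUDA device for this rank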

Private Members

SizeType mTensorParallelism
SizeType mPipelineParallelism
SizeType mRank
SizeType mGpusPerNode
std::vector<SizeType> mDeviceIds

decodingMode.h

namespace tensorrt_llm
namespace runtime
class DecodingMode

Public Types

using UnderlyingType = uint8_t

Public Functions

inline constexpr bool isNone()
inline constexpr bool isTopK()
inline constexpr bool isTopP()
inline constexpr bool isTopKorTopP()
inline constexpr bool isTopKandTopP()
inline constexpr bool isBeamSearch()
inline constexpr bool isMedusa()
inline bool operator==(DecodingMode const &other) const

Public Static Functions

static inline constexpr auto None()
static inline constexpr auto TopK()
static inline constexpr auto TopP()
static inline constexpr auto TopKTopP()
static inline constexpr auto BeamSearch()
static inline constexpr auto Medusa()
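
A short sketch of selecting and inspecting a decoding mode:

    auto mode = DecodingMode::TopKTopP();
    bool const sampling = mode.isTopKorTopP();    // true
    bool const combined = mode.isTopKandTopP();   // true
    bool const beam = mode.isBeamSearch();        // false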

Private Functions

inline constexpr DecodingMode(UnderlyingType state)
inline constexpr bool anyBitSet(UnderlyingType bits) const
inline constexpr bool allBitSet(UnderlyingType bits) const

Private Members

UnderlyingType mState = {}

Private Static Attributes

static constexpr UnderlyingType kNone = {0}
static constexpr UnderlyingType kTopK = {1u << 0}
static constexpr UnderlyingType kTopP = {1u << 1}
static constexpr UnderlyingType kBeamSearch = {1u << 2}
static constexpr UnderlyingType kMedusa = {1u << 3}
static constexpr UnderlyingType kTopKTopP = {kTopK | kTopP}

loraCache.h

namespace tensorrt_llm
namespace runtime

Functions

std::string to_string(LoraCache::TaskLayerModuleConfig const &v)
std::ostream &operator<<(std::ostream &os, LoraCache::TaskLayerModuleConfig const &v)
class LoraCachePageManager
#include <loraCache.h>

Holds the memory of LoRA cache pages, and manages allocation and freeing of whole pages. Memory is pre-allocated either on the host or on the device.

Note that this class is not thread safe.

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)
Parameters:
  • config[in] the page manager configuration

  • bufferManager[in] the BufferManager used to allocate the page blocks

std::optional<std::vector<std::size_t>> claimPages(SizeType numPages)

claim pages

Parameters:

numPages[in] number of pages to claim

Returns:

an optional holding the ids of the claimed pages if the claim succeeded, or std::nullopt if the pages could not be claimed

SizeType numAvailablePages() const

get number of available (free) pages in manager

Returns:

number of free pages in manager

void releasePages(std::vector<std::size_t> const &pages)

release given pages

Parameters:

pages[in] list of pages to release (free)

ITensor::SharedConstPtr blockPtr(SizeType blockIdx) const

return pointer to the given page block

Parameters:

blockIdx[in] the block index

Returns:

pointer to the page block

ITensor::SharedConstPtr pagePtr(std::size_t pageIdx) const

return const pointer to the given page

Parameters:

pageIdx[in] the page index

Returns:

const pointer to the page

ITensor::SharedPtr mutablePagePtr(std::size_t pageIdx)

return mutable pointer to the given page

Parameters:

pageIdx[in] the page index

Returns:

mutable pointer to the page
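
A minimal sketch of the claim/use/release cycle, assuming pageManager is a LoraCachePageManager constructed elsewhere:

    auto maybePageIds = pageManager.claimPages(/*numPages=*/4);
    if (maybePageIds)
    {
        for (auto const pageId : *maybePageIds)
        {
            auto page = pageManager.mutablePagePtr(pageId);   // writable view of the page tensor
            // ... copy LoRA weights into the page ...
        }
        pageManager.releasePages(*maybePageIds);              // return the pages to the free list
    }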

Private Functions

void initialize(BufferManager const &bufferManager)

Private Members

std::vector<TensorPtr> mPageBlocks
std::deque<std::size_t> mFreePageIds
std::vector<std::uint8_t> mIsPageFree
LoraCachePageManagerConfig const mConfig
class LoraCache
#include <loraCache.h>

LoraCache

Caches LoRA weights with LRU eviction policy.

Tasks put in the cache are marked in progress and cannot be evicted until they are marked done.

A cache page holds an optimally sized LoRA. A page is of size [numSlots x pageWidth]. An optimally sized LoRA is one that has the configured optimalAdapterSize.

Conceptually, a slot corresponds to an r=1, 1-layer, 1-module set of in/out weights. The page width is set to the number of weights in the smallest module.

The number of slots per page is then ceilDiv(num weights in optimally sized LoRA, num weights in smallest module).

Cache pages are allocated on one or more blocks.

Public Types

using TensorPtr = ITensor::SharedPtr
using TaskIdType = std::uint64_t
using TaskLayerModuleConfigListPtr = std::shared_ptr<std::vector<TaskLayerModuleConfig>>

Public Functions

LoraCache(LoraCachePageManagerConfig const &pageManagerConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, BufferManager const &bufferManager)

Parameters:
  • pageManagerConfig[in] a LoraCachePageManagerConfig

  • modelConfig[in] a GptModelConfig

  • worldConfig[in] a WorldConfig

  • bufferManager[in] a BufferManager, used only to allocate page blocks

void put(TaskIdType taskId, TensorPtr weights, TensorPtr config, bool load = true)

put a task in the cache, claim pages for it, and optionally load the task weights.

Parameters:
  • taskId[in] the task id

  • weights[in] lora weights tensor

  • config[in] lora config tensor

  • load[in] if true, load the weights before returning; otherwise do not

void loadWeights(TaskIdType taskId, TensorPtr weights, TensorPtr config)

load task weights. This method must be called after put. It is designed to be called asynchronously after put returns with load = false

Parameters:
  • taskId[in] the task id

  • weights[in] lora weights tensor

  • config[in] lora config tensor

inline bool isLoaded(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

true if task is loaded (weights are in place) and false otherwise

bool isDone(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

true if task is marked done and can be evicted

inline bool has(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

true if task is in the cache (not necessarily loaded) and false otherwise

std::shared_ptr<std::vector<TaskLayerModuleConfig>> get(TaskIdType taskId)
Parameters:

taskId[in] the task id

Returns:

list of Value objects with pointers to task weights

void bump(TaskIdType taskId)

bump task and make it the most recently used

Parameters:

taskId[in] the task id

void markTaskDone(TaskIdType taskId)

mark task done meaning it can be evicted

Parameters:

taskId[in] the task id

void markAllDone()

mark all tasks in cache done
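
A minimal sketch of the task lifecycle, assuming loraCache is a LoraCache constructed elsewhere and weights/config are the task's LoRA tensors:

    LoraCache::TaskIdType const taskId = 42;   // illustrative id
    if (!loraCache.has(taskId))
    {
        loraCache.put(taskId, weights, config, /*load=*/false);   // claim pages, defer the copy
        loraCache.loadWeights(taskId, weights, config);           // may run asynchronously later
    }
    auto values = loraCache.get(taskId);   // per layer/module pointers into the cache pages
    // ... populate runtime tensors from *values and run the request ...
    loraCache.markTaskDone(taskId);        // the task may now be evicted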

SizeType determineNumPages(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

number of pages needed to store the given task

SizeType determineNumPages(TensorPtr config) const
Parameters:

config[in] lora config tensor

Returns:

number of pages needed to store the task configured with config tensor

bool fits(TensorPtr config) const
Parameters:

config[in] a lora config tensor

Returns:

true if the task fits in the cache, false otherwise

void copyTask(TaskIdType taskId, LoraCache &deviceCache, bool markDone = false)

copy task to another cache. Caches must have the same page size.

Parameters:
  • taskId[in] the task id to copy

  • deviceCache[in] the LoraCache to copy the task to

  • markDone[in] mark the copied task done as it’s copied

SizeType getNumPages() const
Returns:

total number of pages allocated to cache (used or not)

ITensor::SharedConstPtr getPagePtr(size_t pageId) const
Parameters:

pageId[in] the page id

Returns:

const pointer to page

Public Static Functions

static std::vector<LoraCache::TaskLayerModuleConfig> copyToPages(TensorPtr weights, TensorPtr config, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::unordered_map<SizeType, LoraModule> moduleIdToModel, BufferManager const &manager, std::vector<TensorPtr> const &pages, std::vector<std::size_t> const &pageIds)

Copy task weights to cache pages.

Parameters:
  • weights[in] task weights

  • config[in] task config tensor

  • modelConfig[in] a GptModelConfig

  • worldConfig[in] a WorldConfig

  • moduleIdToModel[in] map from lora module id to LoraModule

  • manager[in] a BufferManager the manager to use to perform the copies

  • pages[out] list of page tensors to copy weights to

  • pageIds[in] page ids for the pages

Returns:

list of cache Value objects

static void splitTransposeCpu(ITensor &output, ITensor const &input, SizeType tpSize, SizeType tpRank)

splits the second dim of input into tpSize parts and writes the tpRank-th split to output

Parameters:
  • output[out] output tensor

  • input[in] input tensor

  • tpSize[in] number of splits

  • tpRank[in] the split to write to output

Private Types

enum ValueStatus

Values:

enumerator kVALUE_STATUS_MISSING
enumerator kVALUE_STATUS_PROCESSING
enumerator kVALUE_STATUS_LOADED
using TaskValuePtr = std::shared_ptr<TaskValue>

Private Functions

void loadWeights(TaskValue &cacheValue, TensorPtr weights, TensorPtr config)
void bumpTaskInProgress(TaskIdType taskId)
ValueStatus getStatus(TaskIdType taskId) const
std::vector<std::size_t> claimPagesWithEvict(SizeType numPages)

claim numPages, evicting tasks if needed

Parameters:

numPages[in] number of pages to claim

Throws:

std::runtime_error – if all pages cannot be claimed

Returns:

list of page ids

std::map<size_t, std::pair<size_t, SizeType>> copyTaskMapPages(TaskValue &targetTaskValue, TaskValue const &sourceTaskValue, std::vector<size_t> const &targetPageIds, LoraCache const &targetCache)

Internal helper method used inside copyTask. Not thread safe on its own

Private Members

LoraCachePageManagerConfig mPageManagerConfig
GptModelConfig mModelConfig
WorldConfig mWorldConfig
mutable std::mutex mPagesMutex
std::unique_ptr<LoraCachePageManager> mCachePageManager
mutable std::mutex mCacheMutex
std::unordered_map<TaskIdType, TaskValuePtr> mCacheMap
std::list<TaskIdType> mInProgressTasks
std::list<TaskIdType> mDoneTasks
std::vector<std::unique_ptr<BufferManager>> mDeviceBufferManagers
std::unique_ptr<BufferManager> mBufferManager
std::unordered_map<SizeType, LoraModule> mModuleIdToModule

Private Static Functions

template<typename T>
static void splitTransposeCpuInner(ITensor &output, ITensor const &input, SizeType tpSize, SizeType tpRank)
struct TaskLayerModuleConfig
#include <loraCache.h>

Contains information on a single layer / module. A list of these configs is associated with each task and can be used to populate runtime tensors.

Public Functions

std::string toString() const
bool operator==(LoraCache::TaskLayerModuleConfig const &o) const

Public Members

std::size_t pageId
SizeType slotIdx
SizeType inSize
SizeType outSize
SizeType moduleId
SizeType layerId
SizeType adapterSize
SizeType numSlots
std::int64_t weightsInPointer
std::int64_t weightsOutPointer
struct TaskValue

Holds configuration and state for a single task.

Public Functions

TaskValue() = delete
~TaskValue() = default
inline TaskValue(std::vector<std::size_t> const &pageIds, TaskLayerModuleConfigListPtr const &configs, std::list<TaskIdType>::iterator it, bool inProgress, bool loaded, bool done, bool loadInProgress = false)
inline TaskValue(TaskValue &&o) noexcept
inline TaskValue &operator=(TaskValue &&o)

Public Members

std::vector<std::size_t> pageIds
TaskLayerModuleConfigListPtr configs
std::list<TaskIdType>::iterator it
bool inProgress
bool loaded
bool done

Marks a task as done during loading. If done=true at the end of loading (end of put, loadWeights, or copyTask), the task will be marked as done.

bool loadInProgress

Indicates that weights are loading, either in put or loadWeights. This is used to block concurrent loadWeights calls for the same task.

loraCachePageManagerConfig.h

namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &os, LoraCachePageManagerConfig const &c)
inline std::string to_string(LoraCachePageManagerConfig const &c)
class LoraCachePageManagerConfig
#include <loraCachePageManagerConfig.h>

Configuration for LoraCachePageManager

See LoraCache docs for description of pages, slots, and page blocks.

Public Functions

inline explicit constexpr LoraCachePageManagerConfig(runtime::MemoryType memType, nvinfer1::DataType dType, SizeType totalNumPages, SizeType maxPagesPerBlock, SizeType slotsPerPage, SizeType pageWidth, SizeType numCopyStreams)
inline constexpr runtime::MemoryType getMemoryType() const noexcept
inline constexpr void setMemoryType(runtime::MemoryType const &memoryType) noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr void setDataType(nvinfer1::DataType const &dtype) noexcept
inline constexpr SizeType getTotalNumPages() const noexcept
inline constexpr void setTotalNumPage(SizeType const &totalNumPages) noexcept
inline constexpr SizeType getMaxPagesPerBlock() const noexcept
inline constexpr void setMaxPagesPerBlock(SizeType const &maxPagesPerBlock) noexcept
inline constexpr SizeType getSlotsPerPage() const noexcept
inline constexpr void setSlotsPerPage(SizeType const &slotsPerPage) noexcept
inline constexpr SizeType getPageWidth() const noexcept
inline constexpr void setPageWidth(SizeType const &pageWidth) noexcept
inline constexpr bool getInitToZero() const noexcept
inline constexpr void setInitToZero(bool initToZero) noexcept
inline constexpr SizeType getNumCopyStreams() const noexcept
inline constexpr void setNumCopyStreams(SizeType numCopyStreams) noexcept
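
A short construction sketch; the values are illustrative, and MemoryType::kGPU is assumed to be the device enumerator:

    LoraCachePageManagerConfig config{MemoryType::kGPU, nvinfer1::DataType::kHALF,
        /*totalNumPages=*/256, /*maxPagesPerBlock=*/64,
        /*slotsPerPage=*/16, /*pageWidth=*/1024, /*numCopyStreams=*/1};
    config.setInitToZero(true);   // zero the page memory when it is allocated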

Private Members

runtime::MemoryType mMemoryType
nvinfer1::DataType mDataType
SizeType mTotalNumPages
SizeType mMaxPagesPerBlock
SizeType mSlotsPerPage
SizeType mPageWidth
SizeType mNumCopyStreams = 1
bool mInitToZero

loraModule.h

namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &output, LoraModule const &module)
class LoraModule

Public Types

enum class ModuleType : SizeType

Values:

enumerator kINVALID
enumerator kATTN_QKV
enumerator kATTN_Q
enumerator kATTN_K
enumerator kATTN_V
enumerator kATTN_DENSE
enumerator kMLP_H_TO_4H
enumerator kMLP_4H_TO_H
enumerator kMLP_GATE
enumerator kCROSS_ATTN_QKV
enumerator kCROSS_ATTN_Q
enumerator kCROSS_ATTN_K
enumerator kCROSS_ATTN_V
enumerator kCROSS_ATTN_DENSE
using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit constexpr LoraModule(ModuleType const &t, SizeType inDim, SizeType outDim, bool inDimFirst, bool outDimFirst, SizeType inTpSplitDim, SizeType outTpSplitDim) noexcept
inline explicit constexpr LoraModule() noexcept
explicit constexpr LoraModule(LoraModule const &o) = default
constexpr LoraModule &operator=(LoraModule const &o) = default
inline constexpr SizeType flattenedInOutSize(SizeType adapterSize) const noexcept
inline constexpr SizeType inSize(SizeType adapterSize) const noexcept
inline constexpr SizeType outSize(SizeType adapterSize) const noexcept
inline constexpr SizeType localInSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType localOutSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType localInDim(SizeType tpSize) const noexcept
inline constexpr SizeType localOutDim(SizeType tpSize) const noexcept
inline constexpr SizeType localInAdapterSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType localOutAdapterSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType localInOutSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType value() const noexcept
inline constexpr std::string_view name() const noexcept
inline constexpr SizeType inDim() const noexcept
inline constexpr SizeType outDim() const noexcept
inline constexpr bool inDimFirst() const noexcept
inline constexpr bool outDimFirst() const noexcept
inline constexpr SizeType inTpSplitDim() const noexcept
inline constexpr SizeType outTpSplitDim() const noexcept

Public Static Functions

static std::vector<LoraModule> createLoraModules(std::vector<std::string> const &loraModuleNames, SizeType hiddenSize, SizeType mlpHiddenSize, SizeType numAttentionHeads, SizeType numKvAttentionHeads, SizeType attentionHeadSize, SizeType tpSize)
static inline constexpr ModuleType toModuleType(std::string_view const &name)
static inline constexpr std::string_view toModuleName(ModuleType t) noexcept
static inline constexpr std::string_view toModuleName(SizeType id)
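
A short sketch of constructing a module description by hand and reading back its basic properties; the particular dimensions and split settings are illustrative, not taken from a real model:

    LoraModule module{LoraModule::ModuleType::kATTN_DENSE, /*inDim=*/1024, /*outDim=*/1024,
        /*inDimFirst=*/false, /*outDimFirst=*/true, /*inTpSplitDim=*/1, /*outTpSplitDim=*/-1};
    auto const name = module.name();   // module name as a string_view
    auto const in = module.inDim();    // 1024
    auto const out = module.outDim();  // 1024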

Private Members

ModuleType mType
SizeType mInDim
SizeType mOutDim
bool mInDimFirst
bool mOutDimFirst
SizeType mInTpSplitDim
SizeType mOutTpSplitDim