Runtime

bufferManager.h

namespace tensorrt_llm
namespace runtime
class BufferManager
#include <bufferManager.h>

A helper class for managing memory on host and device.
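
A minimal usage sketch (illustrative only; assumes bufferManager.h, cudaStream.h and iTensor.h are included, a CUDA device is available, and ITensor::makeShape is taken from iTensor.h):

auto stream = std::make_shared<CudaStream>();
BufferManager manager{stream};

// Allocate a 16 x 32 float tensor on the GPU and zero it.
auto tensor = manager.gpu(ITensor::makeShape({16, 32}), nvinfer1::DataType::kFLOAT);
manager.setZero(*tensor);

// Copy host data into a new device buffer.
std::vector<float> hostData(512, 1.0f);
auto deviceCopy = manager.copyFrom(hostData, MemoryType::kGPU);

// Wait for the asynchronous operations enqueued on the stream to finish.
stream->synchronize();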

Public Types

using IBufferPtr = IBuffer::UniquePtr
using ITensorPtr = ITensor::UniquePtr
using CudaStreamPtr = std::shared_ptr<CudaStream>

Public Functions

explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)

Construct a BufferManager.

Parameters:

stream – [in] The CUDA stream to use for all operations on the GPU (allocation, de-allocation, copying, etc.).

inline ~BufferManager()

Destructor.

IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.

ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.

IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size and memory type.

ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions and memory type.

inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty IBuffer of the given memory type. It may be resized later.

inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty ITensor of the given memory type. It may be reshaped later.

void setMem(IBuffer &buffer, int32_t value) const

Set the contents of the given buffer to value.

void setZero(IBuffer &buffer) const

Set the contents of the given buffer to zero.

void copy(void const *src, IBuffer &dst, MemoryType srcType) const

Copy src to dst.

void copy(IBuffer const &src, void *dst, MemoryType dstType) const

Copy src to dst.

inline void copy(void const *src, IBuffer &dst) const

Copy src to dst.

inline void copy(IBuffer const &src, void *dst) const

Copy src to dst.

void copy(IBuffer const &src, IBuffer &dst) const

Copy src to dst.

IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

CudaStream const &getStream() const

Get the underlying cuda stream.

std::size_t memoryPoolReserved() const

The current size of the memory reserved by the memory pool.

std::size_t memoryPoolUsed() const

The current size of the memory used by the memory pool.

std::size_t memoryPoolFree() const

The current size of the memory free in the memory pool.

void memoryPoolTrimTo(std::size_t size)

Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
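
A sketch of inspecting and trimming the pool backing the asynchronous GPU allocations (sizes are in bytes; manager is the BufferManager from the sketch above):

auto const reservedBytes = manager.memoryPoolReserved();
auto const usedBytes = manager.memoryPoolUsed();
auto const freeBytes = manager.memoryPoolFree();
manager.memoryPoolTrimTo(usedBytes);   // try to return unused reserved memory to the driver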

Public Static Functions

static IBufferPtr gpuSync(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size on the GPU, using cudaMalloc.

static ITensorPtr gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.

static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size on the CPU.

static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions on the CPU.

static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned IBuffer of the given size on the CPU.

static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned ITensor of the given dimensions on the CPU.

static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.

static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.

static IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size in UVM.

static ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions in UVM.

Public Static Attributes

static constexpr auto kBYTE_TYPE = nvinfer1::DataType::kUINT8

Private Members

CudaStreamPtr mStream
bool const mTrimPool

Private Static Functions

static void initMemoryPool(int device)
static std::size_t memoryPoolReserved(int device)
static std::size_t memoryPoolUsed(int device)
static inline std::size_t memoryPoolFree(int device)
static void memoryPoolTrimTo(int device, std::size_t size)

Friends

friend class ::BufferManagerTest

common.h

namespace tensorrt_llm
namespace runtime

Typedefs

using SizeType = std::int32_t
using TokenIdType = std::int32_t
template<typename T>
using StringPtrMap = std::unordered_map<std::string, std::shared_ptr<T>>

cudaEvent.h

namespace tensorrt_llm
namespace runtime
class CudaEvent

Public Types

using pointer = cudaEvent_t

Public Functions

inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)

Creates a new cuda event. The event will be destroyed in the destructor.

Parameters:

flags – Flags for event creation. By default, event timing is disabled.

inline explicit CudaEvent(pointer event, bool ownsEvent = true)

Pass an existing cuda event to this object.

Parameters:
  • event – The event to pass to this object.

  • ownsEvent – Whether this object owns the event and destroys it in the destructor.

inline pointer get() const

Returns the event associated with this object.

inline void synchronize() const

Synchronizes the event.

Private Types

using element_type = std::remove_pointer_t<pointer>
using EventPtr = std::unique_ptr<element_type, Deleter>

Private Members

EventPtr mEvent
class Deleter

Public Functions

inline explicit Deleter(bool ownsEvent)
inline explicit Deleter()
inline constexpr void operator()(pointer event) const

Private Members

bool mOwnsEvent

cudaStream.h

namespace tensorrt_llm
namespace runtime
class CudaStream

Public Functions

inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)

Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.

Parameters:
  • flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.

  • priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.

inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)

Pass an existing cuda stream to this object.

Parameters:
  • stream – The stream to pass to this object.

  • device – The device on which the stream was created.

  • ownsStream – Whether this object owns the stream and destroys it in the destructor.

inline explicit CudaStream(cudaStream_t stream)

Construct with an existing cuda stream or the default stream by passing nullptr.

inline int getDevice() const

Returns the device on which the stream was created.

inline cudaStream_t get() const

Returns the stream associated with this object.

inline void synchronize() const

Synchronizes the stream.

inline void record(CudaEvent::pointer event) const

Record an event on the stream.

inline void record(CudaEvent const &event) const

Record an event on the stream.

inline void wait(CudaEvent::pointer event) const

Wait for an event.

inline void wait(CudaEvent const &event) const

Wait for an event.
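
A minimal sketch of cross-stream synchronization with a CudaEvent (illustrative only; the enqueued work is elided):

CudaStream producer;
CudaStream consumer;
CudaEvent event;                 // timing disabled by default

// ... enqueue work on producer ...
producer.record(event);          // mark the point the consumer must wait for
consumer.wait(event);            // the consumer stream waits on the device; the host is not blocked
// ... enqueue dependent work on consumer ...
consumer.synchronize();          // block the host until the consumer stream has finished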

Private Types

using StreamPtr = std::unique_ptr<std::remove_pointer_t<cudaStream_t>, Deleter>

Private Members

StreamPtr mStream
int mDevice = {-1}
class Deleter

Public Functions

inline explicit Deleter(bool ownsStream)
inline explicit Deleter()
inline constexpr void operator()(cudaStream_t stream) const

Private Members

bool mOwnsStream

decodingInput.h

namespace tensorrt_llm
namespace runtime
class DecodingInput

Public Types

using TensorPtr = std::shared_ptr<ITensor const>

Public Functions

inline DecodingInput(SizeType maxLength, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxBatchSize, TensorPtr logits, TensorPtr endIds)

Public Members

SizeType step
SizeType maxLength
SizeType maxAttentionWindow
SizeType sinkTokenLength
SizeType maxBatchSize
SizeType maxStopWordsLen
SizeType maxBadWordsLen
TensorPtr logits
std::optional<std::vector<TensorPtr>> logitsVec
TensorPtr endIds
TensorPtr finished
TensorPtr sequenceLimitLength
TensorPtr embeddingBias
TensorPtr lengths
TensorPtr badWordsList
TensorPtr badWordsPtrs
TensorPtr badWordsLens
TensorPtr stopWordsList
TensorPtr stopWordsPtrs
TensorPtr stopWordsLens
TensorPtr noRepeatNgramSize
TensorPtr batchSlots
TensorPtr cacheIndirection
std::optional<MedusaInputs> medusaInputs
class MedusaInputs

Public Members

TensorPtr medusaPaths
TensorPtr medusaTreeIds
std::vector<std::vector<TensorPtr>> medusaLogits
TensorPtr medusaCurTokensPerStep
TensorPtr medusaTargetTokensPerStep

decodingOutput.h

namespace tensorrt_llm
namespace runtime
class DecodingOutput

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit DecodingOutput(TensorPtr ids)

Public Members

TensorPtr ids
TensorPtr newTokensSteps
TensorPtr newTokens
std::vector<TensorPtr> newTokensVec
TensorPtr finished
TensorPtr finishedSum
TensorPtr logProbs
TensorPtr cumLogProbs
TensorPtr parentIds
TensorPtr lengths
TensorPtr cacheIndirection
BeamHypotheses beamHypotheses
std::optional<MedusaOutputs> medusaOutputs

Public Static Attributes

static constexpr float kNegativeInfinity = -1e20f
class BeamHypotheses

Public Functions

void empty(BufferManager &manager)
void reshape(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
void release()
void init(BufferManager &manager, TokenIdType endId)
BeamHypotheses slice(SizeType batchIndex, SizeType size) const

Public Members

TensorPtr outputIdsTgt
TensorPtr sequenceLengthsTgt
TensorPtr cumLogProbs
TensorPtr normedScores
TensorPtr logProbs
TensorPtr minNormedScores
TensorPtr numBeams
TensorPtr isDone
class MedusaOutputs

Public Members

TensorPtr medusaNextDraftTokens
TensorPtr medusaAcceptedTokensLen
TensorPtr medusaAcceptedLengthsCumSum
TensorPtr medusaPathsOffsets

generationInput.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor, typename PromptTuningParams>
class GenericGenerationInput
#include <generationInput.h>

  • endId, is the token ID that marks the end of the input sequence (also known as EOS or end-of-sequence). For example, it is 50,256 for the GPT2 model, which has a vocabulary of 50,257 tokens,

  • padId, is the token ID that is used for padding (i.e. fills in the slots that are at an index greater-or-equal to the input length for padded sequences). It can be set to the same value as endId,

  • ids, is the tensor of input IDs. That tensor must be allocated on the GPU. When the input tensor is padded, the shape of ids is [batchSize, maxInputLength], where batchSize and maxInputLength must respect the maximum sizes in sessionConfig passed to the GptSession constructor. When the input is packed, the shape of ids is [numTokens], where numTokens is the sum of the lengths of the different sequences in the batch,

  • lengths, is the tensor of input sequence lengths. That tensor must be allocated on the GPU and contain batchSize values,

  • packed, indicates if the ids tensor is packed or padded. In this release, that flag must match the value passed to the constructor through the instance of the ModelConfig class. In a future release, the session may be made more flexible and automatically pad or pack the input,

  • embeddingBiasOpt, is a tensor of floating-point values on the GPU that contains the bias to add to the logits during sampling (after the projection from hidden states to logits as the last step of the model). This tensor must have vocabSize elements (as defined in the modelConfig argument passed to the constructor),

  • badWordsList, is a tensor of integers on the GPU that encodes the list of words that have to be banned from generated sequences. Its shape is [2, badWordsLength], as explained below, or [batchSize, 2, badWordsLength] when there is a different list for each sequence in the batch,

  • stopWordsList, is a tensor of integers on the GPU that encodes the list of words that trigger the end of the generation for a sequence. Its shape is [2, stopWordsLength], as explained below, or [batchSize, 2, stopWordsLength] when there is a different list for each sequence in the batch,

  • maxNewTokens, is the maximum number of tokens to generate.

The badWordsList and stopWordsList tensors have the same shape [2, length]. Let’s consider an example with three words to describe the representation of those lists. The first word contains tokens [5, 7, 3], the second one contains [9, 2] and the third one is composed of tokens [6, 2, 4, 1]. In total, there are 9 tokens. That’s the length. The shape of the tensor is [2, 9]. The first row of the tensor must contain the 9 token IDs and the second row must store the inclusive prefix-sum of the word lengths as shown on the following diagram:

   0           3       5              9
   |           |       |              |
   V           V       V              V
[  5,  7,  3,  9,  2,  6,  2,  4,  1]
[  3,  5,  9, -1, -1, -1, -1, -1, -1]

In case all the words are made of a single token, the inner-most dimension of the tensor must be increased by 1 (i.e. the length for 4 words, each made of a single token, must be 5 instead of 4; the shape is [2, 5]).
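
A sketch (illustrative only; the loop and variable names are not part of the API, and <vector> and <cstdint> are assumed) that builds the two rows for the three example words above, ready to be copied to the GPU, e.g. with BufferManager::copyFrom, and assigned to badWordsList or stopWordsList:

std::vector<std::vector<std::int32_t>> words{{5, 7, 3}, {9, 2}, {6, 2, 4, 1}};

std::vector<std::int32_t> tokens;   // first row: concatenated token IDs
std::vector<std::int32_t> offsets;  // second row: inclusive prefix-sum of word lengths
for (auto const& word : words)
{
    tokens.insert(tokens.end(), word.begin(), word.end());
    offsets.push_back(static_cast<std::int32_t>(tokens.size()));
}
offsets.resize(tokens.size(), -1);  // pad with -1 so both rows have the same length
// tokens  == {5, 7, 3, 9, 2, 6, 2, 4, 1}
// offsets == {3, 5, 9, -1, -1, -1, -1, -1, -1}
// Remember the single-token caveat above: if every word has exactly one token,
// extend both rows by one element so the length exceeds the number of words.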

Public Types

using TensorPtr = TTensor

Public Functions

inline explicit GenericGenerationInput(SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)

Public Members

SizeType endId
SizeType padId
TensorPtr ids
TensorPtr lengths
bool packed
TensorPtr embeddingBias
TensorPtr badWordsList
TensorPtr stopWordsList
std::optional<SizeType> maxNewTokens
PromptTuningParams promptTuningParams
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>

Public Types

using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationInput(SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)

generationOutput.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor>
class GenericGenerationOutput
#include <generationOutput.h>

  • ids, is a tensor that contains the output token IDs. Its shape is [batchSize, beamWidth, maxSeqLength] where maxSeqLength is the sum of maxInputLength and maxNewTokens. After generation, it contains, for each sequence, a copy of the input tokens followed by the output tokens. When a sequence is shorter than maxSeqLength, padding tokens are added at the end of the sequence.

Note that the shape of that tensor is different in this version of TensorRT-LLM from its shape in previous versions.

  • logProbs, is a tensor of floating-point values on the GPU to store the log-prob of the generated tokens. Its shape is [maxNewTokens, batchSize, beamWidth]. Its shape will likely change in a future release to match the shape of the output ids tensor.

  • contextLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the context. Its shape is [batchSize, maxSequenceLength, vocabSizePadded]. If remove_input_padding is enabled, its shape is [packedSize, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_context_logits or gather_all_token_logits parameter enabled.

    After inference is complete, the context logits are available in GenerationOutput.contextLogits; these tensors reside on the GPU. For details on how to retrieve them, refer to the gptSessionBenchmark.cpp example.

    It is important to point out that enabling the computation may have an impact on performance (the language modeling head (LM head) has to perform a matrix multiplication on all the context tokens instead of just the last one).

  • generationLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the generation. Its shape is [batchSize, beamWidth, maxOutputLen, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_generation_logits or gather_all_token_logits parameter enabled.

    Generation logits can also be obtained through GenerationOutput.generationLogits after inference is completed.

  • onTokenGenerated, is a callback function invoked in the generation loop to pass newly generated tokens to the caller while the loop continues to execute. An implementation of that callback must accept the output ids tensor, the generation step and a boolean flag that indicates if the generation is complete.
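
A sketch (illustrative only; outputIds and outputLengths are placeholder tensors) of installing the streaming callback:

GenerationOutput outputs{outputIds, outputLengths};
outputs.onTokenGenerated = [](GenerationOutput::TensorPtr const& ids, SizeType step, bool finished)
{
    // Inspect or copy the tokens generated so far; finished is true once generation completes.
};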

Public Types

using TensorPtr = TTensor
using Callback = std::function<void(TensorPtr const &ids, SizeType step, bool finished)>

Public Functions

inline explicit GenericGenerationOutput(TensorPtr ids, TensorPtr lengths)

Public Members

TensorPtr ids
TensorPtr lengths
TensorPtr cumLogProbs
TensorPtr logProbs
TensorPtr contextLogits
TensorPtr generationLogits
Callback onTokenGenerated
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>

Public Types

using Base = GenericGenerationOutput<ITensor::SharedPtr>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationOutput(TensorPtr ids, TensorPtr lengths)

gptDecoder.h

namespace tensorrt_llm
namespace layers
namespace runtime
class IGptDecoder

Subclassed by tensorrt_llm::runtime::GptDecoder< T >

Public Types

using TensorPtr = std::shared_ptr<ITensor>

Public Functions

virtual ~IGptDecoder() = default
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength, std::optional<TensorPtr> const &batchSlots = std::nullopt) = 0
virtual bool forward(DecodingOutput &output, DecodingInput const &input) = 0
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) = 0
virtual SamplingConfig const &getSamplingConfig() = 0

Public Static Functions

static void acceptDraftTokensByIds(ITensor const &targetTokenIds, ITensor const &draftTokenIds, ITensor const &contextLengths, ITensor const &numDraftTokens, ITensor &sequenceLengths, ITensor const &finishedVec, ITensor &finishedFinal, ITensor &finishedSum, ITensor const &batchSlots, BufferManager::CudaStreamPtr const &stream)
static void acceptDraftTokensByLogits(ITensor &draftLogits, ITensor const &targetLogits, ITensor &draftProbs, ITensor &targetProbs, ITensor const &numDraftTokens, ITensor &finished, ITensor const &batchSlots, SizeType vocabSize, SizeType vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, curandState_t *curandState, BufferManager::CudaStreamPtr const &stream)
static void updateKVCacheBasedOnAcceptedTokens(ITensor const &acceptedOffsets, ITensor const &packedAcceptedIds, ITensor const &pointerArray, ITensor const &pastKeyValueLengths, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, BufferManager::CudaStreamPtr stream, SizeType rewindDraftTokenCount, SizeType maxAttentionWindow, SizeType maxBlocksPerSeq, nvinfer1::DataType dtype)
static inline std::unique_ptr<IGptDecoder> create(DecodingMode const &mode, nvinfer1::DataType dtype, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, BufferManager::CudaStreamPtr const &stream, std::optional<runtime::SizeType> maxTokensPerStep = std::nullopt, std::optional<runtime::SizeType> maxNumMedusaHeads = std::nullopt)
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder

Public Types

using CudaStreamPtr = BufferManager::CudaStreamPtr
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

GptDecoder(DecodingMode const &mode, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, CudaStreamPtr const &stream, std::optional<runtime::SizeType> maxTokensPerStep = std::nullopt, std::optional<runtime::SizeType> maxNumMedusaHeads = std::nullopt)
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength, std::optional<TensorPtr> const &batchSlots = std::nullopt) override
virtual bool forward(DecodingOutput &output, DecodingInput const &input) override
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) override
inline virtual SamplingConfig const &getSamplingConfig() override

Private Members

BufferManager mManager
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
TensorPtr mLogProbsTiled
SamplingConfig mSamplingConfig
cudaDeviceProp mProp
size_t mMaxBatchSize

gptDecoderBatch.h

namespace tensorrt_llm
namespace runtime
class GptDecoderBatch : public tensorrt_llm::runtime::IGptDecoderBatch
#include <gptDecoderBatch.h>

GPT decoder class with support for in-flight batching.
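
A sketch of the asynchronous decoding pattern (illustrative only; vocabSize, vocabSizePadded, stream, output and input are placeholders that must be prepared as described by the members below):

GptDecoderBatch decoder{vocabSize, vocabSizePadded, stream};
// decoder.setup(...) and decoder.newRequests(...) must be called before stepping.
auto token = decoder.forwardAsync(output, input);   // enqueue one step without blocking the host
// ... other host-side work ...
decoder.forwardSync(*token);                        // wait for that step to complete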

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = ITensor::SharedPtr
using SharedConstPtr = ITensor::SharedConstPtr

Public Functions

GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream)
virtual void setup(DecodingMode const &mode, SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, bool fusedDecoder, nvinfer1::DataType dtype, GptModelConfig const &modelConfig) override

Setup the decoder before calling forward()

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) override

Initialize the decoder with a new batch of inputs.

virtual void newRequests(std::vector<SizeType> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs) override

Initialize the batched decoder at seqSlots with new requests.

virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::Token const &token) override

Wait for the call to forwardAsync associated with a token to complete.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override

Run one step for all requests without blocking the host thread.

virtual void forwardSync() override

Wait for the last call to forwardAsync to complete.

inline virtual std::vector<bool> getFinished() const override
Returns:

[batchSize], indicators of finished requests

inline virtual TensorPtr getOutputIds(SizeType batchIdx) const override
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu

inline virtual TensorPtr getOutputIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu

virtual CudaEvent finalize(SizeType batchIdx) const override

Gather final beam search results for request batchIdx. The result will only be available after the returned event has completed.

virtual void finalize() const override

Gather final beam search results for all requests.

inline virtual TensorPtr getParentIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu

inline virtual TensorPtr getCumLogProbs() const override
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getCumLogProbs(SizeType batchIdx) const override
Returns:

[maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs() const override
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs(SizeType batchIdx) const override
Returns:

[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getAllNewTokens() const override

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

inline virtual TensorPtr getNewTokens(SizeType iter = 0) const override

Get tokens generated in one step of last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

inline virtual std::vector<SizeType> getNbSteps() const override
Returns:

[batchSize], the number of generation steps executed on each request

inline virtual TensorPtr getNbFinished() const override
Returns:

[1], number of finished sequences, in pinned host memory

inline virtual TensorPtr getNextDraftTokens() const override
Returns:

[batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu

inline virtual TensorPtr getMedusaAcceptedLengthsCumSum() const override
Returns:

[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu

inline virtual TensorPtr getMedusaAcceptedPackedPaths() const override
Returns:

[batchSize * maxMedusaHeads], accepted paths packed into a contiguous tensor, on gpu

Private Types

using GptDecoderPtr = std::unique_ptr<IGptDecoder>
using DecodingInputPtr = std::unique_ptr<DecodingInput>
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>

Private Functions

CudaEvent postProcessRequest(SizeType batchIdx) const

Gather final beam search results for request batchIdx.

void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)

Initialize the decoder at batchIdx with a new request.

void allocateMedusaBuffers()

Allocate buffers for medusa decoding.

void setupMedusa(GptModelConfig const &modelConfig)

Setup buffers for medusa decoding.

void newRequestSpeculativeDecoding(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)

Sets up decoder internal tensors for a new speculative decoding request.

void newRequestMedusa(SizeType batchIdx, decoder_batch::Request const &request)

Sets up decoder internal tensors for a new Medusa request.

void forwardAsyncUnfusedDecoder(SizeType step, decoder_batch::Output &output, decoder_batch::Input const &input, CudaEvent const &eventStart)

Asynchronously calls the unfused decoder for the whole batch in a loop.

void forwardAsyncFusedDecoder(SizeType step, decoder_batch::Output &output, decoder_batch::Input const &input, CudaEvent const &eventStart)

Asynchronously calls the fused decoder for the whole batch.

Private Members

std::size_t const mVocabSize
std::size_t const mVocabSizePadded
CudaStreamPtr mStream
BufferManager mBufferManager
TokenPtr mForwardToken
CudaEvent mForwardEvent
std::vector<CudaStreamPtr> mStreams
std::vector<GptDecoderPtr> mDecoders
std::vector<DecodingInputPtr> mDecodingInputs
std::vector<DecodingOutputPtr> mDecodingOutputs
DecodingInputPtr mJointDecodingInput
DecodingOutputPtr mJointDecodingOutput
std::vector<bool> mAcceptByLogits
TensorPtr mNumDraftTokens
TensorPtr mCurandStates
std::vector<SizeType> mNbSteps
std::vector<bool> mFinished
TensorPtr mFinishedSum
std::vector<SizeType> mMaxNewTokens
std::vector<SizeType> mBeamWidths
std::vector<SizeType> mGeneratedTokensPerEngineStep
TensorPtr mFinishedSteps
TensorPtr mDraftProbs
TensorPtr mTargetProbs
TensorPtr mDraftTokenIds
TensorPtr mDraftLogits
TensorPtr mBatchSlotsSetup
TensorPtr mBatchSlotsDecoder
TensorPtr mBatchSlotsAcceptTokens
TensorPtr mBatchSlotsAcceptLogits
TensorPtr mTargetLogitsPtrs
SizeType mMaxSequenceLength = {}
SizeType mMaxAttentionWindow = {}
SizeType mSinkTokenLength = {}
SizeType mActualBatchSize = {}
SizeType mMaxTokensPerEngineStep = {}
SizeType mMaxStopWordsLen = {}
SizeType mMaxBadWordsLen = {}
SizeType mMaxTokensPerDecoderStep = {}
bool mFusedDecoder = {false}
bool mUseMedusa = {false}

gptJsonConfig.h

namespace tensorrt_llm
namespace runtime
class GptJsonConfig

Public Functions

inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, GptModelConfig const &modelConfig)
inline GptModelConfig getModelConfig() const
inline std::string const &getName() const
inline std::string const &getVersion() const
inline std::string const &getPrecision() const
inline constexpr SizeType getTensorParallelism() const
inline constexpr SizeType getPipelineParallelism() const
inline constexpr SizeType getWorldSize() const
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
inline std::string engineFilename(WorldConfig const &worldConfig) const

Public Static Functions

static GptJsonConfig parse(std::string const &json)
static GptJsonConfig parse(std::istream &json)
static GptJsonConfig parse(std::filesystem::path const &path)

Private Members

std::string const mName
std::string const mVersion
std::string const mPrecision
SizeType const mTensorParallelism
SizeType const mPipelineParallelism
GptModelConfig const mGptModelConfig

gptModelConfig.h

namespace tensorrt_llm
namespace runtime
struct MambaConfig

Public Members

SizeType dState = 0
SizeType dConv = 0
SizeType expand = 0
class GptModelConfig

Public Types

enum class ModelVariant : std::int32_t

Values:

enumerator kGpt
enumerator kGlm
enumerator kMamba

Public Functions

inline explicit GptModelConfig(SizeType vocabSize, SizeType nbLayers, SizeType nbHeads, SizeType hiddenSize, nvinfer1::DataType dtype)
inline constexpr SizeType getVocabSize() const noexcept
inline constexpr SizeType getVocabSizePadded(SizeType worldSize) const noexcept
inline constexpr SizeType getNbLayers(SizeType pipelineParallelism = 1) const
inline constexpr SizeType getNbHeads() const noexcept
inline constexpr SizeType getNbKvHeads() const noexcept
inline constexpr void setNbKvHeads(SizeType nbKvHeads) noexcept
inline constexpr SizeType getHiddenSize() const noexcept
inline constexpr SizeType getSizePerHead() const noexcept
inline constexpr void setSizePerHead(SizeType sizePerHead) noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr bool useGptAttentionPlugin() const noexcept
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
inline constexpr bool useMambaConv1dPlugin() const noexcept
inline constexpr void useMambaConv1dPlugin(bool useMambaConv1dPlugin) noexcept
inline constexpr bool usePackedInput() const noexcept
inline constexpr void usePackedInput(bool inputPacked) noexcept
inline constexpr bool usePagedKvCache() const noexcept
inline constexpr void usePagedKvCache(bool pagedKvCache) noexcept
inline constexpr bool usePagedState() const noexcept
inline constexpr void usePagedState(bool pagedState) noexcept
inline constexpr SizeType getTokensPerBlock() const noexcept
inline constexpr void setTokensPerBlock(SizeType TokensPerBlock) noexcept
inline constexpr common::QuantMode getQuantMode() const noexcept
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
inline constexpr bool supportsInflightBatching() const noexcept
inline constexpr SizeType getMaxBatchSize() const noexcept
inline constexpr void setMaxBatchSize(SizeType maxBatchSize) noexcept
inline constexpr SizeType getMaxBeamWidth() const noexcept
inline constexpr void setMaxBeamWidth(SizeType maxBeamWidth) noexcept
inline constexpr SizeType getMaxInputLen() const noexcept
inline constexpr void setMaxInputLen(SizeType maxInputLen) noexcept
inline constexpr SizeType getMaxSequenceLen() const noexcept
inline constexpr void setMaxSequenceLen(SizeType maxSequenceLen) noexcept
inline constexpr std::optional<SizeType> getMaxNumTokens() const noexcept
inline constexpr void setMaxNumTokens(std::optional<SizeType> maxNumTokens) noexcept
inline constexpr bool usePromptTuning() const noexcept
inline constexpr SizeType getMaxPromptEmbeddingTableSize() const noexcept
inline constexpr void setMaxPromptEmbeddingTableSize(SizeType maxPromptEmbeddingTableSize) noexcept
inline constexpr bool computeContextLogits() const noexcept
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
inline constexpr bool computeGenerationLogits() const noexcept
inline constexpr void computeGenerationLogits(bool computeGenerationLogits) noexcept
inline ModelVariant getModelVariant() const
inline void setModelVariant(ModelVariant modelVariant)
inline constexpr bool useCustomAllReduce() const noexcept
inline constexpr void useCustomAllReduce(bool customAllReduce) noexcept
inline constexpr void setMaxDraftLen(SizeType maxDraftLen) noexcept
inline SizeType getMaxDraftLen() const
inline constexpr SizeType getMaxTokensPerStep() const noexcept
inline constexpr void setUseContextFMHAForGeneration(bool useContextFMHAForGeneration) noexcept
inline constexpr bool getContextFMHAForGeneration() const noexcept
inline constexpr void setPagedContextFMHA(bool pagedContextFMHA) noexcept
inline constexpr bool getPagedContextFMHA() const noexcept
inline constexpr bool useLoraPlugin() const noexcept
inline constexpr void useLoraPlugin(bool useLoraPlugin) noexcept
inline std::vector<LoraModule> const &getLoraModules() const noexcept
inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept
inline constexpr SizeType getMlpHiddenSize() const noexcept
inline constexpr void setMlpHiddenSize(SizeType mlpHiddenSize) noexcept
inline constexpr SizeType getMaxLoraRank() const noexcept
inline constexpr void setMaxLoraRank(SizeType maxLoraRank) noexcept
inline constexpr bool useMedusa() const noexcept
inline std::optional<MedusaModule> getMedusaModule() const noexcept
inline void setMedusaModule(MedusaModule const &medusaModule) noexcept
inline nvinfer1::DataType getKvDataType() const noexcept
inline constexpr bool isTransformerBased() const noexcept
inline bool hasMambaConfig() const noexcept
inline std::optional<MambaConfig> getMambaConfig() const noexcept
inline void setMambaConfig(MambaConfig const &mambaConfig) noexcept
inline constexpr bool isSsmBased() const noexcept

Private Members

SizeType mVocabSize
SizeType mNbLayers
SizeType mNbHeads
SizeType mNbKvHeads
SizeType mHiddenSize
SizeType mSizePerHead
nvinfer1::DataType mDataType
bool mUseGptAttentionPlugin
bool mUseMambaConv1dPlugin
bool mInputPacked
bool mPagedKvCache
bool mPagedState
SizeType mTokensPerBlock
common::QuantMode mQuantMode
SizeType mMaxBatchSize
SizeType mMaxBeamWidth
SizeType mMaxInputLen
SizeType mMaxSequenceLen
std::optional<SizeType> mMaxNumTokens
bool mComputeContextLogits
bool mComputeGenerationLogits
ModelVariant mModelVariant
bool mUseCustomAllReduce
SizeType mMaxPromptEmbeddingTableSize
SizeType mMaxDraftLen
bool mUseContextFMHAForGeneration
bool mPagedContextFMHA
bool mUseLoraPlugin
std::vector<LoraModule> mLoraModules
SizeType mMlpHiddenSize
SizeType mMaxLoraRank
std::optional<MedusaModule> mMedusaModule
std::optional<MambaConfig> mMambaConfig

gptSession.h

namespace tensorrt_llm
namespace batch_manager
namespace kv_cache_manager
namespace runtime
class GptSession

Public Types

using LoggerPtr = std::shared_ptr<nvinfer1::ILogger>

Public Functions

GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
Parameters:
  • sessionConfig – Configuration of the session,

  • modelConfig – Description of the model,

  • worldConfig – Description of the environment,

  • engineBuffer – The compiled TensorRT engine (const void*),

  • engineSize – The size in bytes of the TensorRT engine (size_t),

  • logger – The optional logger.

inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
nvinfer1::ILogger &getLogger() const
BufferManager const &getBufferManager() const
inline GptModelConfig const &getModelConfig() const
inline WorldConfig const &getWorldConfig() const
inline int getDevice() const noexcept
inline bool getNormalizeLogProbs() const noexcept
nvinfer1::DataType getLogitDataType() const
void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig, std::shared_ptr<GenerationProfiler> const generationProfiler = nullptr)

This function performs the generation loop.

Given input tensors to read from and output tensors to populate, this member function runs the generation loop until it reaches the maximum number of tokens that can be produced or each sequence has reached completion (due to the production of “end-of-sequence” or a word in the list of “stop words”). The pseudo-code of that function looks like (member function names were changed to keep the presentation simple):

// Have all the sequences in the batch reached completion?
bool allFinished = false;

// Until all sequences are finished or the number of steps reaches the limit...
for (int step = 0; !allFinished && step < maxNewTokens; ++step) {

// Trigger the computation of the logits...
computeLogits(...);

// Run the sampling to produce a token (for each active sequence) from the logits.
allFinished = generateTokensFromLogits(...);

// Callback to stream the output tokens while the generation loop continues.
onTokenGenerated(...);
}
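
A calling sketch (illustrative only; modelConfig, worldConfig, enginePath, logger, endId, padId and the input/output tensors are placeholders prepared as described in generationInput.h and generationOutput.h, and SamplingConfig construction is assumed from its own header):

GptSession::Config sessionConfig{/*maxBatchSize=*/1, /*maxBeamWidth=*/1, /*maxSequenceLength=*/1024};
GptSession session{sessionConfig, modelConfig, worldConfig, enginePath, logger};

GenerationInput inputs{endId, padId, inputIds, inputLengths, /*packed=*/false};
inputs.maxNewTokens = 64;
GenerationOutput outputs{outputIds, outputLengths};

SamplingConfig samplingConfig{/*beamWidth=*/1};
session.generate(outputs, inputs, samplingConfig);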

Private Types

using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
using TensorPtr = runtime::ITensor::SharedPtr
using TokenGeneratedCallback = std::function<void(SizeType step, bool finished)>

Private Functions

inline bool useCudaGraphs()
void generateBatched(std::vector<GenerationOutput> &microBatchesOutputs, std::vector<GenerationInput> const &microBatchesInputs, SamplingConfig const &samplingConfig, TokenGeneratedCallback const &onTokenGenerated, std::shared_ptr<GenerationProfiler> const generationProfiler)
void setup(Config const &sessionConfig)
void createContexts()
void createBuffers(SizeType numMicroBatches)
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches, DecodingMode const &decodingMode)
void createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, KvCacheConfig const &config)
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType> const &generationBatchesOffsets, KvCacheManager const *kvCacheManager)
SizeType executeGenerationStep(SizeType step, std::vector<GenerationInput> const &microBatchesInputs, std::vector<GenerationOutput> &microBatchesOutputs, std::vector<SizeType> const &microBatchOffsets, KvCacheManager *kvCacheManager, std::vector<bool> &microBatchesFinished)
void decoderStepAsync(SizeType decoderStep, SizeType microBatchId)

Execute decoder on last PP rank, receive decoder output on other PP ranks.

bool shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType microBatchId)

Synchronize with the decoder and return the shouldStop flag.

void finalize(SizeType microBatchId)

Collect final output ids and log probs on last PP rank and send them to first PP rank.

Receiving is asynchronous on the host, so synchronization is required before access.

void kvCacheAddSequences(SizeType beamWidth, SizeType microBatchId, SizeType firstBatchIdx)
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType microBatchId) const

Populate outputIds and return reference to newTokens tensor.

TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)

Private Members

GptModelConfig const mModelConfig
WorldConfig const mWorldConfig
int mDevice = {-1}
std::shared_ptr<NcclCommunicator> mPipelineComm
std::shared_ptr<CudaStream> mCommStream
CudaEvent mCommEvent = {}
ITensor::SharedPtr mCommPtrs
std::vector<std::shared_ptr<IpcMemory>> mIpcMemoryHandles
SizeType mDecoderMaxSequenceLength = {}
SizeType mDecoderMaxAttentionWindow = {}
SizeType mDecoderSinkTokenLength = {}
LoggerPtr mLogger
std::shared_ptr<TllmRuntime> mRuntime
std::shared_ptr<KvCacheManager> mKvCacheManager
MicroBatchConfig mMicroBatchConfig
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
std::vector<CudaEvent> mReceivedEvents
bool mCudaGraphMode = {false}
std::vector<CudaGraphExecutor> mCudaGraphInstances
bool mNormalizeLogProbs = true

Friends

friend class batch_manager::TrtGptModelV1
class Config
#include <gptSession.h>

Configuration for session execution and buffer sizes. generate may be called with batch size and beam width smaller than the configured parameters.

maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.

Public Functions

inline Config(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength)

Public Members

SizeType maxBatchSize
SizeType maxBeamWidth
SizeType maxSequenceLength
bool decoderPerRequest = {false}
bool cudaGraphMode = {false}
KvCacheConfig kvCacheConfig = {}
std::optional<SizeType> ctxMicroBatchSize = std::nullopt
std::optional<SizeType> genMicroBatchSize = std::nullopt
std::optional<DecodingMode> decodingMode = std::nullopt
bool normalizeLogProbs = true
class CudaGraphExecutor

Public Functions

CudaGraphExecutor() = default
inline ~CudaGraphExecutor()
inline bool hasInstance()
void clear()
void prepareNextGraph(TllmRuntime const &runtime, SizeType nextContextId)
void launch(CudaStream const &stream)

Private Functions

void create(cudaGraph_t const &graph)
bool update(cudaGraph_t const &graph)
void uploadToStream(CudaStream const &stream)

Private Members

cudaGraphExec_t mInstance
class GenerationProfiler
#include <gptSession.h>

Optional profiler class to profile the generation phase of an inference request.

Public Functions

inline GenerationProfiler()
inline CudaEvent const &getStart() const
inline CudaEvent const &getEnd() const
inline float getElapsedTimeMs()

Public Static Attributes

static constexpr unsigned int flags = {cudaEventDefault}

Private Members

CudaEvent start
CudaEvent end
class MicroBatchConfig

Public Functions

inline MicroBatchConfig()
explicit MicroBatchConfig(SizeType maxBatchSize, SizeType pipelineParallelism, std::optional<SizeType> genMicroBatchSize, std::optional<SizeType> ctxMicroBatchSize)
inline constexpr SizeType numCtxPerGen() const
inline constexpr SizeType getGenGraphId(SizeType flipFlopId, SizeType generationBatchId) const

flip-flop between 2 graph instances for each generation batch.

Public Members

SizeType numCtxBatches
SizeType numGenBatches
SizeType ctxBatchSize
SizeType genBatchSize
namespace utils

Functions

std::vector<uint8_t> loadEngine(std::string const &enginePath)

iBuffer.h

template<>
struct MemoryTypeString<MemoryType::kGPU>

Public Static Attributes

static constexpr auto value = "GPU"
template<>
struct MemoryTypeString<MemoryType::kCPU>

Public Static Attributes

static constexpr auto value = "CPU"
template<>
struct MemoryTypeString<MemoryType::kPINNED>

Public Static Attributes

static constexpr auto value = "PINNED"
template<>
struct MemoryTypeString<MemoryType::kUVM>

Public Static Attributes

static constexpr auto value = "UVM"
template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

Public Types

using type = float

Public Static Attributes

static constexpr char name[] = "float"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>

Public Types

using type = half

Public Static Attributes

static constexpr char name[] = "half"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>

Public Types

using type = std::int8_t

Public Static Attributes

static constexpr char name[] = "int8"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>

Public Types

using type = std::int32_t

Public Static Attributes

static constexpr char name[] = "int32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>

Public Types

using type = std::int64_t

Public Static Attributes

static constexpr char name[] = "int64"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

Public Types

using type = std::uint32_t

Public Static Attributes

static constexpr char name[] = "uint32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

Public Types

using type = std::uint64_t

Public Static Attributes

static constexpr char name[] = "uint64"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

Public Types

using type = bool

Public Static Attributes

static constexpr char name[] = "bool"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

Public Types

using type = std::uint8_t

Public Static Attributes

static constexpr char name[] = "uint8"
static constexpr auto size = sizeof(type)
template<>
struct TRTDataType<std::int8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT8
template<>
struct TRTDataType<std::int32_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT32
template<>
struct TRTDataType<std::uint32_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
template<>
struct TRTDataType<std::int64_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT64
template<>
struct TRTDataType<std::uint64_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
template<>
struct TRTDataType<std::uint8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kUINT8
namespace tensorrt_llm
namespace runtime

Typedefs

template<typename T>
using PointerElementType = typename std::remove_reference_t<T>::element_type

Enums

enum class MemoryType : std::int32_t

Values:

enumerator kGPU
enumerator kCPU
enumerator kPINNED
enumerator kUVM

Functions

template<typename T>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::shared_ptr<T> const &ptr) noexcept
template<typename T, typename D>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::unique_ptr<T, D> &&ptr) noexcept
template<typename T>
T const *bufferCast(IBuffer const &buffer)
template<typename T>
T *bufferCast(IBuffer &buffer)
std::ostream &operator<<(std::ostream &output, IBuffer const &buffer)

Utility function to print a buffer.
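
A short sketch combining the functions above (assumes bufferManager.h, iBuffer.h and <iostream> are included):

auto buffer = BufferManager::cpu(8, nvinfer1::DataType::kINT32);
auto* data = bufferCast<std::int32_t>(*buffer);
for (std::size_t i = 0; i < buffer->getSize(); ++i)
    data[i] = static_cast<std::int32_t>(i);
std::cout << *buffer << std::endl;   // prints the buffer via the operator<< above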

template<MemoryType T>
struct MemoryTypeString
template<>
struct MemoryTypeString<MemoryType::kGPU>

Public Static Attributes

static constexpr auto value = "GPU"
template<>
struct MemoryTypeString<MemoryType::kCPU>

Public Static Attributes

static constexpr auto value = "CPU"
template<>
struct MemoryTypeString<MemoryType::kPINNED>

Public Static Attributes

static constexpr auto value = "PINNED"
template<>
struct MemoryTypeString<MemoryType::kUVM>

Public Static Attributes

static constexpr auto value = "UVM"
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits
#include <iBuffer.h>

For converting a TensorRT data type to a C++ data type.
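
For example (a sketch; assumes iBuffer.h and <type_traits> are included):

static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kFLOAT>::type, float>, "kFLOAT maps to float");
static_assert(DataTypeTraits<nvinfer1::DataType::kHALF>::size == 2, "half occupies 2 bytes");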

template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

Public Types

using type = float

Public Static Attributes

static constexpr char name[] = "float"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>

Public Types

using type = half

Public Static Attributes

static constexpr char name[] = "half"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>

Public Types

using type = std::int8_t

Public Static Attributes

static constexpr char name[] = "int8"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>

Public Types

using type = std::int32_t

Public Static Attributes

static constexpr char name[] = "int32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>

Public Types

using type = std::int64_t

Public Static Attributes

static constexpr char name[] = "int64"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

Public Types

using type = std::uint32_t

Public Static Attributes

static constexpr char name[] = "uint32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

Public Types

using type = std::uint64_t

Public Static Attributes

static constexpr char name[] = "uint64"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

Public Types

using type = bool

Public Static Attributes

static constexpr char name[] = "bool"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

Public Types

using type = std::uint8_t

Public Static Attributes

static constexpr char name[] = "uint8"
static constexpr auto size = sizeof(type)
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>

Public Types

using type = typename DataTypeTraits<kDataType, kUnsigned, false>::type*

Public Static Attributes

static constexpr char name[] = "*"
static constexpr auto size = sizeof(type)
class BufferDataType
#include <iBuffer.h>

A wrapper around nvinfer1::DataType that provides support for pointer types.

Public Functions

inline constexpr BufferDataType(nvinfer1::DataType dataType, bool _unsigned = false, bool pointer = false)
inline constexpr operator nvinfer1::DataType() const noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr bool isPointer() const noexcept
inline constexpr bool isUnsigned() const
inline constexpr std::size_t getSize() const noexcept

Public Static Attributes

static constexpr auto kTrtPointerType = nvinfer1::DataType::kINT64

Private Members

nvinfer1::DataType mDataType
bool mUnsigned
bool mPointer
template<typename T, bool = false>
struct TRTDataType
#include <iBuffer.h>

For converting a C++ data type to a TensorRT data type.
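
For example (a sketch; assumes iBuffer.h is included):

static_assert(TRTDataType<float>::value == nvinfer1::DataType::kFLOAT, "float maps to kFLOAT");
static_assert(TRTDataType<std::int32_t>::value == nvinfer1::DataType::kINT32, "int32_t maps to kINT32");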

template<>
struct TRTDataType<float>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kFLOAT
template<>
struct TRTDataType<half>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kHALF
template<>
struct TRTDataType<std::int8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT8
template<>
struct TRTDataType<std::int32_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT32
template<>
struct TRTDataType<std::uint32_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
template<>
struct TRTDataType<std::int64_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT64
template<>
struct TRTDataType<std::uint64_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
template<>
struct TRTDataType<bool>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kBOOL
template<>
struct TRTDataType<std::uint8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kUINT8
template<>
struct TRTDataType<void*>

Public Static Attributes

static constexpr auto value = BufferDataType::kTrtPointerType
template<typename T>
struct TRTDataType<T*>

Public Static Attributes

static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}

Private Static Attributes

static constexpr auto kUnderlyingType = BufferDataType{TRTDataType<T, false>::value}
class IBuffer

Subclassed by tensorrt_llm::runtime::ITensor

Public Types

using UniquePtr = std::unique_ptr<IBuffer>
using SharedPtr = std::shared_ptr<IBuffer>
using UniqueConstPtr = std::unique_ptr<IBuffer const>
using SharedConstPtr = std::shared_ptr<IBuffer const>
using DataType = nvinfer1::DataType

Public Functions

virtual void *data() = 0

Returns a pointer to underlying array.

virtual void const *data() const = 0

Returns a pointer to underlying array.

inline virtual void *data(std::size_t index)

Returns a pointer to the underlying array at a given element index.

inline virtual void const *data(std::size_t index) const

Returns a pointer to the underlying array at a given element index.

virtual std::size_t getSize() const = 0

Returns the size (in number of elements) of the buffer.

inline virtual std::size_t getSizeInBytes() const

Returns the size (in bytes) of the buffer.

virtual std::size_t getCapacity() const = 0

Returns the capacity of the buffer.

virtual DataType getDataType() const = 0

Returns the data type of the buffer.

virtual char const *getDataTypeName() const
virtual MemoryType getMemoryType() const = 0

Returns the memory type of the buffer.

virtual char const *getMemoryTypeName() const
virtual void resize(std::size_t newSize) = 0

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

virtual void release() = 0

Releases the buffer. It will be reset to nullptr.

virtual ~IBuffer() = default
IBuffer(IBuffer const&) = delete

Not allowed to copy.

IBuffer &operator=(IBuffer const&) = delete

Not allowed to copy.

Public Static Functions

static UniquePtr slice(SharedPtr buffer, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.

Parameters:
  • buffer – The buffer to view.

  • offset – The offset of the view.

  • size – The size of the view.

Returns:

A view on the buffer.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
static inline UniquePtr slice(SharedPtr buffer, std::size_t offset)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
static inline UniquePtr view(SharedPtr tensor)

Returns a view on the underlying tensor which can be independently resized.

Parameters:

tensor – The tensor to view.

Returns:

A view on the tensor.

static inline UniquePtr view(SharedPtr tensor, std::size_t size)

Returns a view on the underlying tensor with a different size.

Parameters:
  • tensor – The tensor to view.

  • size – The size of the view.

Returns:

A view on the tensor.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)

Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.

Parameters:
  • data – The data to wrap.

  • type – The data type of the data.

  • size – The size of the buffer.

  • capacity – The capacity of the buffer.

Returns:

An IBuffer.

static inline UniquePtr wrap(void *data, DataType type, std::size_t size)
template<typename T>
static inline UniquePtr wrap(T *data, std::size_t size, std::size_t capacity)
template<typename T>
static inline UniquePtr wrap(T *data, std::size_t size)
template<typename T>
static inline UniquePtr wrap(std::vector<T> &v)
static MemoryType memoryType(void const *data)

Determine the memory type of a pointer.
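
A minimal usage sketch of wrapping existing host memory in an IBuffer, assuming the class is declared in iBuffer.h; the vector contents are illustrative, and the buffer does not take ownership of the memory:

    #include <iBuffer.h>
    #include <cstdint>
    #include <vector>

    using namespace tensorrt_llm::runtime;

    std::vector<std::int32_t> tokens{1, 2, 3, 4};
    // Wrap the vector without copying; the IBuffer cannot be resized
    // beyond the vector's current size (its capacity).
    IBuffer::UniquePtr buffer = IBuffer::wrap(tokens);
    auto const numElements = buffer->getSize();                  // 4
    auto const typeName = buffer->getDataTypeName();             // e.g. "INT32" (exact string is an assumption)
    auto const memType = IBuffer::memoryType(tokens.data());     // host memory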

Protected Functions

IBuffer() = default
inline std::size_t toBytes(std::size_t size) const

Converts an element index or size into the corresponding size in bytes.

template<typename T>
class BufferRange : public tensorrt_llm::common::ArrayView<T>

Public Types

using Base = tensorrt_llm::common::ArrayView<T>

Public Functions

inline BufferRange(T *data, size_type size)
inline explicit BufferRange(IBuffer &buffer)
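
A minimal sketch of typed element access through BufferRange, assuming the ArrayView base provides begin()/end() iterators:

    std::vector<float> values{0.f, 1.f, 2.f};
    auto buffer = IBuffer::wrap(values);   // host buffer viewing the vector's memory
    BufferRange<float> range(*buffer);     // typed view over the same elements
    for (auto& v : range)                  // in-place update of every element
    {
        v *= 2.0f;
    }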

iGptDecoderBatch.h

namespace tensorrt_llm
namespace runtime
class IGptDecoderBatch : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
#include <iGptDecoderBatch.h>

GPT decoder class with support for in-flight batching.

Subclassed by tensorrt_llm::runtime::GptDecoderBatch

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = std::shared_ptr<ITensor>
using TokenPtr = std::unique_ptr<decoder_batch::Token const>

Public Functions

virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::Token const &token) = 0

Wait for the call to forwardAsync associated with a token to complete.

inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)

Run one step for all requests and wait for completion on the host.
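
As an illustration of the asynchronous pattern, here is a minimal sketch assuming decoder is a concrete IGptDecoderBatch (e.g. GptDecoderBatch) and input/output have already been populated:

    // Launch one decoding step; the host is not blocked.
    auto token = decoder.forwardAsync(output, input);
    // ... overlap other host-side work here ...
    // Block until the step associated with the token has completed.
    decoder.forwardSync(*token);
    // forward(output, input) is the blocking equivalent of the two calls above.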

virtual TensorPtr getOutputIds(SizeType batchIdx) const = 0
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu

virtual CudaEvent finalize(SizeType batchIdx) const = 0

Gather final beam search results for request batchIdx. Results will only be available after the returned event has completed.

virtual std::vector<bool> getFinished() const = 0
Returns:

[batchSize (actual)], marks finished requests (per batch)

virtual TensorPtr getCumLogProbs() const = 0
Returns:

[batchSize, beamWidth], cumulative log probabilities (per beam), on gpu

virtual TensorPtr getCumLogProbs(SizeType batchIdx) const = 0
Returns:

[beamWidth], cumulative log probabilities (per beam) for request batchIdx, on gpu

virtual TensorPtr getLogProbs() const = 0
Returns:

[batchSize, beamWidth, maxSeqLen], log probabilities (per beam), on gpu

virtual TensorPtr getLogProbs(SizeType batchIdx) const = 0
Returns:

[beamWidth, maxSeqLen], log probabilities (per beam) for request batchIdx, on gpu

virtual TensorPtr getParentIds() const = 0
virtual std::vector<SizeType> getNbSteps() const = 0
virtual void newRequests(std::vector<SizeType> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs) = 0

Initialize the batched decoder at seqSlots with new requests.

virtual TensorPtr getNextDraftTokens() const = 0
Returns:

[batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu

virtual TensorPtr getMedusaAcceptedLengthsCumSum() const = 0
Returns:

[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu

virtual TensorPtr getMedusaAcceptedPackedPaths() const = 0
Returns:

[batchSize * maxMedusaHeads], accepted paths packed into continuous tensor, on gpu

Protected Functions

IGptDecoderBatch() = default
namespace decoder_batch

Typedefs

using Output = decoder::Output
class Request

Public Types

using ConstTensorPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr
using BufferPtr = IBuffer::SharedPtr

Public Functions

inline explicit Request(ConstTensorPtr ids, SizeType inputLen, std::optional<SizeType> maxNewTokens = std::nullopt, std::optional<SizeType> endId = std::nullopt)

Public Members

ConstTensorPtr ids
SizeType inputLen
std::optional<SizeType> maxNewTokens
std::optional<SizeType> endId
BufferPtr draftTokens
std::optional<TensorPtr> draftLogits
TensorPtr embeddingBias
TensorPtr badWordsList
TensorPtr stopWordsList
bool computeCumLogProbs
bool computeLogProbs
SizeType generatedTokensPerEngineStep
TensorPtr medusaPaths
TensorPtr medusaTreeIds
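
A minimal sketch of constructing a request, assuming inputIds is an ITensor::SharedConstPtr holding the prompt token ids and promptLength is its length (both built elsewhere; the remaining values are illustrative):

    decoder_batch::Request request{inputIds, /*inputLen=*/promptLength,
                                   /*maxNewTokens=*/64, /*endId=*/2};
    request.computeCumLogProbs = true;   // also return cumulative log probabilities
    request.computeLogProbs = false;
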
class Input

Public Types

using TensorConstPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit Input(std::vector<TensorConstPtr> const &logits, std::vector<bool> const &active)
inline explicit Input(std::vector<TensorConstPtr> const &logits)
inline explicit Input(std::vector<TensorPtr> const &logits, std::vector<bool> const &active)
inline explicit Input(std::vector<TensorPtr> const &logits)

Public Members

std::vector<TensorConstPtr> logits
std::vector<bool> active
TensorConstPtr cacheIndirection
std::vector<std::vector<TensorConstPtr>> medusaLogits
class Token

Public Functions

inline explicit Token(CudaEvent &&event, std::vector<bool> const &active)

Public Members

CudaEvent event
std::vector<bool> active

iStatefulGptDecoder.h

namespace tensorrt_llm
namespace runtime
class IStatefulGptDecoder
#include <iStatefulGptDecoder.h>

GPT decoder class with support for in-flight batching.

Subclassed by tensorrt_llm::runtime::IGptDecoderBatch

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

virtual void setup(DecodingMode const &mode, SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, bool fusedDecoder, nvinfer1::DataType dtype, GptModelConfig const &modelConfig) = 0

Set up the decoder before calling forward(); this also calls reshapeBuffers.

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) = 0

Initialize the decoder with a new batch of inputs.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0

Run one step for all requests without blocking the host thread.

virtual void forwardSync() = 0

Wait for the last call to forwardAsync to complete.

inline virtual void forward(decoder::Output &output, decoder::Input const &input)

Run one step for all requests.

virtual void finalize() const = 0

Gather final beam search results for all requests.

virtual TensorPtr getOutputIds() const = 0
Returns:

[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu

virtual TensorPtr getCumLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

virtual TensorPtr getLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

virtual TensorPtr getNewTokens(SizeType iter = 0) const = 0

Get tokens generated in one step of the last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

virtual TensorPtr getAllNewTokens() const = 0

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

virtual TensorPtr getNbFinished() const = 0
Returns:

[1], number of finished sequences, in pinned host memory
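
A minimal sketch of the stateful decoding loop, assuming decoder is a concrete implementation already configured via setup(), that inputs, outputs and samplingConfig describe the new batch, that input/output are prepared decoder::Input/decoder::Output objects, and that bufferCast is the usual typed-pointer helper from iBuffer.h (the element type of getNbFinished is assumed here to be SizeType):

    decoder.newBatch(inputs, outputs, samplingConfig);
    for (SizeType step = 0; step < maxNewTokens; ++step)
    {
        decoder.forward(output, input);               // one blocking decoding step
        auto nbFinished = decoder.getNbFinished();    // [1], in pinned host memory
        if (bufferCast<SizeType>(*nbFinished)[0] == batchSize)
            break;                                    // all sequences have finished
    }
    decoder.finalize();                               // gather beam search results
    auto outputIds = decoder.getOutputIds();          // [batchSize, beamWidth, maxSequenceLength]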

virtual ~IStatefulGptDecoder() = default

Protected Functions

IStatefulGptDecoder() = default
namespace decoder
class Input

Public Types

using TensorPtr = std::shared_ptr<ITensor const>

Public Functions

inline explicit Input(TensorPtr logits)

Public Members

TensorPtr logits
TensorPtr cacheIndirection
class Output

Public Types

using TensorPtr = std::shared_ptr<ITensor>

Public Functions

Output() = default

Public Members

TensorPtr cacheIndirection
TensorPtr sequenceLengths

iTensor.h

namespace nvinfer1
namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)

Utility function to print a shape.

std::ostream &operator<<(std::ostream &output, ITensor const &tensor)

Utility function to print a tensor with its shape.

class ITensor : public virtual tensorrt_llm::runtime::IBuffer

Public Types

using UniquePtr = std::unique_ptr<ITensor>
using SharedPtr = std::shared_ptr<ITensor>
using UniqueConstPtr = std::unique_ptr<ITensor const>
using SharedConstPtr = std::shared_ptr<ITensor const>
using Shape = nvinfer1::Dims
using DimType = std::remove_reference_t<decltype(Shape::d[0])>

Public Functions

~ITensor() override = default
virtual Shape const &getShape() const = 0

Returns the tensor dimensions.

virtual void reshape(Shape const &dims) = 0

Sets the tensor dimensions. The new size of the tensor will be volume(dims)

inline virtual void resize(std::size_t newSize) override

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

ITensor(ITensor const&) = delete

Not allowed to copy.

ITensor &operator=(ITensor const&) = delete

Not allowed to copy.

inline void squeeze(SizeType dim)

Removes the given unit dimension from this tensor.

inline void unsqueeze(SizeType dim)

Adds a unit dimension at the specified position.

inline bool shapeEquals(Shape const &other) const
inline bool shapeEquals(std::initializer_list<SizeType> const &other) const
template<typename T>
inline bool shapeEquals(T const *dims, SizeType count) const

Public Static Functions

static inline std::int64_t volume(Shape const &dims)

Returns the volume of the dimensions. Returns -1 if dims.nbDims < 0.

static inline std::size_t volumeNonNegative(Shape const &shape)

Returns the volume of the dimensions. Throws if shape.nbDims < 0.

static Shape squeeze(Shape const &shape, SizeType dim)

Removes the given unit dimension from shape.

Parameters:
  • shape – The shape to squeeze.

  • dim – The dimension that should be removed (“squeezed”).

Returns:

A new shape without the unit dimension.

static Shape unsqueeze(Shape const &shape, SizeType dim)

Add a unit dimension to shape at the specified position.

Parameters:
  • shape – The shape to unsqueeze.

  • dim – The dimension where unit dimension should be added.

Returns:

A new shape with the added unit dimension.

static UniquePtr slice(SharedPtr tensor, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.

Parameters:
  • tensor – The tensor to view.

  • offset – The offset of the view w.r.t. dimension 0 of the tensor.

  • size – The size of the view w.r.t. dimension 0 of the tensor.

Returns:

A view on the buffer.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
static inline UniquePtr slice(SharedPtr tensor, std::size_t offset)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
static UniquePtr view(IBuffer::SharedPtr buffer, Shape const &dims)

Returns a view on the underlying buffer (or tensor) with the given shape.

Parameters:
  • buffer – The buffer to view.

  • dims – The shape of the view.

Returns:

A view on the tensor.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
static inline UniquePtr view(SharedPtr tensor)

Returns a view on the underlying tensor which can be independently reshaped.

Parameters:

tensor – The tensor to view.

Returns:

A view on the tensor.

static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)

Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.

Parameters:
  • data – The data to wrap.

  • type – The data type of the data.

  • shape – The shape of the tensor.

  • capacity – The capacity of the buffer.

Returns:

An ITensor.

static inline UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(std::vector<T> &v, Shape const &shape)
static Shape makeShape(std::initializer_list<SizeType> const &dims)

A convenience function to create a tensor shape with the given dimensions.

static std::string toString(Shape const &dims)

A convenience function for converting a tensor shape to a string.

static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)

A convenience function to compare shapes.

template<typename T>
static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType count)

A convenience function to compare shapes.
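
The static shape helpers compose naturally; a short example (the exact string produced by toString is an assumption):

    auto shape = ITensor::makeShape({2, 3, 4});
    auto const numElements = ITensor::volume(shape);          // 24
    auto expanded = ITensor::unsqueeze(shape, 0);             // unit dimension added at position 0
    auto restored = ITensor::squeeze(expanded, 0);            // unit dimension removed again
    bool const same = ITensor::shapeEquals(shape, restored);  // true
    std::string text = ITensor::toString(shape);              // human-readable form of the shape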

Protected Functions

ITensor() = default

Protected Static Functions

static inline DimType castSize(size_t newSize)

ipcUtils.h

namespace tensorrt_llm
namespace runtime

Functions

void setPeerAccess(WorldConfig const &worldConfig, bool enable = true)
class IpcMemory

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

IpcMemory(WorldConfig const &worldConfig, std::size_t bufferSize)
~IpcMemory()
inline std::vector<void*> const &getCommPtrsTensor() const

Public Static Attributes

static constexpr size_t FLAGS_SIZE = kernels::MAX_ALL_REDUCE_BLOCKS * sizeof(uint32_t)

Private Functions

void allocateIpcMemory()
void destroyIpcMemory()

Private Members

WorldConfig mWorldConfig
std::vector<void*> mCommPtrs
std::size_t mBufferSize
void *mBufferPtr = {nullptr}

memoryCounters.h

namespace tensorrt_llm
namespace runtime
class MemoryCounters

Public Types

using SizeType = std::size_t
using DiffType = std::ptrdiff_t

Public Functions

MemoryCounters() = default
inline SizeType getGpu() const
inline SizeType getCpu() const
inline SizeType getPinned() const
inline SizeType getUVM() const
inline DiffType getGpuDiff() const
inline DiffType getCpuDiff() const
inline DiffType getPinnedDiff() const
inline DiffType getUVMDiff() const
template<MemoryType T>
inline void allocate(SizeType size)
void allocate(MemoryType memoryType, SizeType size)
template<MemoryType T>
inline void deallocate(SizeType size)
void deallocate(MemoryType memoryType, SizeType size)
std::string toString() const

Public Static Functions

static MemoryCounters &getInstance()
static std::string bytesToString(SizeType bytes, int precision = 2)
static std::string bytesToString(DiffType bytes, int precision = 2)
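
A short usage sketch of the global counters:

    #include <iostream>

    auto& counters = MemoryCounters::getInstance();
    std::cout << counters.toString() << std::endl;   // summary of all memory types
    std::cout << "GPU allocated: "
              << MemoryCounters::bytesToString(counters.getGpu()) << std::endl;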

Private Members

std::atomic<SizeType> mGpu = {}
std::atomic<SizeType> mCpu = {}
std::atomic<SizeType> mPinned = {}
std::atomic<SizeType> mUVM = {}
std::atomic<DiffType> mGpuDiff = {}
std::atomic<DiffType> mCpuDiff = {}
std::atomic<DiffType> mPinnedDiff = {}
std::atomic<DiffType> mUVMDiff = {}

promptTuningParams.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor>
class GenericPromptTuningParams

Public Types

using TensorPtr = TTensor
using SizeType = tensorrt_llm::runtime::SizeType

Public Functions

inline explicit GenericPromptTuningParams(TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr())

Public Members

TensorPtr embeddingTable
TensorPtr tasks
TensorPtr vocabSize
std::vector<bool> promptTuningEnabled
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>

Public Types

using TensorPtr = ITensor::SharedPtr
using SizeType = GenericPromptTuningParams::SizeType

Public Functions

inline explicit PromptTuningParams(TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr)
void fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, const SizeType numContextRequests, std::vector<SizeType> const &reqBeamWidths, std::vector<SizeType> const &reqPromptLengths, BufferManager const &manager, bool packedInput)

samplingConfig.h

Defines

SET_FROM_OPTIONAL(varName, VarName, VarType)
namespace tensorrt_llm
namespace runtime
class SamplingConfig

Public Functions

inline explicit SamplingConfig(SizeType beamWidth = 1)
inline explicit SamplingConfig(std::vector<SamplingConfig> const &configs)
inline explicit SamplingConfig(executor::SamplingConfig const &samplingConfig, std::optional<executor::SpeculativeDecodingConfig> const &specDecodingConfig)
inline bool operator==(SamplingConfig const &other) const

Public Members

SizeType beamWidth
OptVec<FloatType> temperature
OptVec<SizeType> minLength
OptVec<FloatType> repetitionPenalty
OptVec<FloatType> presencePenalty
OptVec<FloatType> frequencyPenalty
OptVec<SizeType> topK
OptVec<FloatType> topP
OptVec<uint64_t> randomSeed
OptVec<FloatType> topPDecay
OptVec<FloatType> topPMin
OptVec<SizeType> topPResetIds
OptVec<FloatType> beamSearchDiversityRate
OptVec<FloatType> lengthPenalty
OptVec<SizeType> earlyStopping
OptVec<FloatType> draftAcceptanceThreshold
OptVec<std::vector<runtime::SizeType>> topKMedusaHeads
std::optional<bool> normalizeLogProbs
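
A minimal sketch of configuring sampling for a batch with a single shared value per option; the numbers are illustrative:

    SamplingConfig config{/*beamWidth=*/1};
    config.temperature = std::vector<float>{0.8f};
    config.topK = std::vector<SizeType>{40};
    config.topP = std::vector<float>{0.95f};
    config.randomSeed = std::vector<uint64_t>{1234ULL};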

Private Types

using FloatType = float
template<typename T>
using OptVec = std::optional<std::vector<T>>
template<typename T>
using Vec = std::vector<T>

Private Static Functions

template<typename T>
static inline OptVec<T> fuseValues(std::vector<SamplingConfig> const &configs, std::function<OptVec<T>(SizeType ci)> accessor)

tllmLogger.h

namespace tensorrt_llm
namespace runtime
class TllmLogger : public nvinfer1::ILogger

Public Functions

void log(Severity severity, nvinfer1::AsciiChar const *msg) noexcept override
Severity getLevel()
void setLevel(Severity level)
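
A short usage sketch:

    TllmLogger logger;
    logger.setLevel(nvinfer1::ILogger::Severity::kWARNING);              // only warnings and errors
    logger.log(nvinfer1::ILogger::Severity::kINFO, "suppressed at this level");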

worldConfig.h

namespace tensorrt_llm
namespace runtime
class WorldConfig

Public Functions

explicit WorldConfig(SizeType tensorParallelism = 1, SizeType pipelineParallelism = 1, SizeType rank = 0, SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType>> const &deviceIds = std::nullopt)
inline constexpr SizeType getSize() const noexcept
inline constexpr SizeType getTensorParallelism() const noexcept
inline constexpr bool isTensorParallel() const noexcept
inline constexpr SizeType getPipelineParallelism() const noexcept
inline constexpr bool isPipelineParallel() const noexcept
inline constexpr SizeType getRank() const noexcept
inline constexpr SizeType getGpusPerNode() const noexcept
inline SizeType getGpusPerGroup() const noexcept
inline SizeType getDevice() const noexcept
inline constexpr SizeType getPipelineParallelRank() const noexcept
inline constexpr SizeType getTensorParallelRank() const noexcept
inline constexpr bool isFirstPipelineParallelRank() const noexcept
inline constexpr bool isLastPipelineParallelRank() const noexcept

Is my rank the last rank in its pipeline?

inline constexpr SizeType getLastRank() const noexcept
std::vector<SizeType> getPipelineParallelGroup() const
bool validMpiConfig() const

Public Static Functions

static WorldConfig mpi(SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt, std::optional<std::vector<SizeType>> const &deviceIds = std::nullopt)

Public Static Attributes

static constexpr SizeType kDefaultGpusPerNode = 8
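
A short sketch of constructing a world configuration by hand (values are illustrative); WorldConfig::mpi() can instead derive the configuration from the MPI environment:

    // 2-way tensor parallel x 2-way pipeline parallel, this process is rank 0.
    WorldConfig worldConfig{/*tensorParallelism=*/2, /*pipelineParallelism=*/2, /*rank=*/0};
    auto const worldSize = worldConfig.getSize();              // tensor * pipeline parallelism
    auto const tpRank = worldConfig.getTensorParallelRank();
    auto const ppRank = worldConfig.getPipelineParallelRank();
    auto const deviceId = worldConfig.getDevice();             // CUDA device for this rank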

Private Members

SizeType mTensorParallelism
SizeType mPipelineParallelism
SizeType mRank
SizeType mGpusPerNode
std::vector<SizeType> mDeviceIds

decodingMode.h

namespace tensorrt_llm
namespace runtime
class DecodingMode

Public Types

using UnderlyingType = uint8_t

Public Functions

inline constexpr bool isNone()
inline constexpr bool isTopK()
inline constexpr bool isTopP()
inline constexpr bool isTopKorTopP()
inline constexpr bool isTopKandTopP()
inline constexpr bool isBeamSearch()
inline constexpr bool isMedusa()
inline bool operator==(DecodingMode const &other) const

Public Static Functions

static inline constexpr auto None()
static inline constexpr auto TopK()
static inline constexpr auto TopP()
static inline constexpr auto TopKTopP()
static inline constexpr auto BeamSearch()
static inline constexpr auto Medusa()
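
A short sketch of selecting and inspecting a decoding mode:

    auto mode = DecodingMode::TopKTopP();
    bool const sampling = mode.isTopKorTopP();    // true
    bool const combined = mode.isTopKandTopP();   // true
    bool const beam = mode.isBeamSearch();        // false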

Private Functions

inline constexpr DecodingMode(UnderlyingType state)
inline constexpr bool anyBitSet(UnderlyingType bits) const
inline constexpr bool allBitSet(UnderlyingType bits) const

Private Members

UnderlyingType mState = {}

Private Static Attributes

static constexpr UnderlyingType kNone = {0}
static constexpr UnderlyingType kTopK = {1u << 0}
static constexpr UnderlyingType kTopP = {1u << 1}
static constexpr UnderlyingType kBeamSearch = {1u << 2}
static constexpr UnderlyingType kMedusa = {1u << 3}
static constexpr UnderlyingType kTopKTopP = {kTopK | kTopP}

loraCache.h

namespace tensorrt_llm
namespace runtime

Functions

std::string to_string(LoraCache::TaskLayerModuleConfig const &v)
std::ostream &operator<<(std::ostream &os, LoraCache::TaskLayerModuleConfig const &v)
class LoraCachePageManager
#include <loraCache.h>

Holds the memory of LoRA cache pages, and manages allocation and freeing of whole pages. Memory is pre-allocated either on the host or on the device.

Note that this class is not thread safe.

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)
Parameters:
  • config[in] the page manager configuration

  • bufferManager[in] the BufferManager used to allocate the page blocks

std::optional<std::vector<std::size_t>> claimPages(SizeType numPages)

claim pages

Parameters:

numPages[in] number of pages to claim

Returns:

an optional holding the ids of the claimed pages if the claim succeeded, or std::nullopt if the pages could not be claimed

SizeType numAvailablePages() const

get number of available (free) pages in manager

Returns:

number of free pages in manager

void releasePages(std::vector<std::size_t> const &pages)

release given pages

Parameters:

pages[in] list of pages to release (free)

ITensor::SharedConstPtr blockPtr(SizeType blockIdx) const

return pointer to the given page block

Parameters:

blockIdx[in] the block index

Returns:

pointer to the page block

ITensor::SharedConstPtr pagePtr(std::size_t pageIdx) const

return const pointer to the given page

Parameters:

pageIdx[in] the page index

Returns:

const pointer to the page

ITensor::SharedPtr mutablePagePtr(std::size_t pageIdx)

return mutable pointer to the given page

Parameters:

pageIdx[in] the page index

Returns:

mutable pointer to the page
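
A minimal sketch of the claim/use/release cycle, assuming pageManager is a LoraCachePageManager constructed elsewhere:

    auto maybePageIds = pageManager.claimPages(/*numPages=*/4);
    if (maybePageIds)
    {
        for (auto const pageId : *maybePageIds)
        {
            auto page = pageManager.mutablePagePtr(pageId);   // writable view of the page tensor
            // ... copy LoRA weights into the page ...
        }
        pageManager.releasePages(*maybePageIds);              // return the pages to the free list
    }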

Private Functions

void initialize(BufferManager const &bufferManager)

Private Members

std::vector<TensorPtr> mPageBlocks
std::deque<std::size_t> mFreePageIds
std::vector<std::uint8_t> mIsPageFree
LoraCachePageManagerConfig const mConfig
class LoraCache
#include <loraCache.h>

LoraCache

Caches LoRA weights with LRU eviction policy.

Tasks put in the cache are marked in progress and cannot be evicted until they are marked done.

A cache page holds an optimally sized LoRA. A page is of size [numSlots x pageWidth]. An optimally sized LoRA is one that has the configured optimalAdapterSize.

Conceptually, a slot corresponds to an r=1, 1-layer, 1-module set of in/out weights. The page width is set to the number of weights in the smallest module.

The number of slots per page is then ceilDiv(num weights in optimally sized LoRA, num weights in smallest module).

Cache pages are allocated on one or more blocks.

Public Types

using TensorPtr = ITensor::SharedPtr
using TaskIdType = std::uint64_t
using TaskLayerModuleConfigListPtr = std::shared_ptr<std::vector<TaskLayerModuleConfig>>

Public Functions

LoraCache(LoraCachePageManagerConfig const &pageManagerConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, BufferManager const &bufferManager)

Parameters:
  • pageManagerConfig[in] a LoraCachePageManagerConfig

  • modelConfig[in] a GptModelConfig

  • worldConfig[in] a WorldConfig

  • bufferManager[in] a BufferManager, used only to allocate page blocks

void put(TaskIdType taskId, TensorPtr weights, TensorPtr config, bool load = true)

put a task in the cache, claim pages for it, and optionally load the task weights.

Parameters:
  • taskId[in] the task id

  • weights[in] lora weights tensor

  • config[in] lora config tensor

  • load[in] if true, load the weights before returning; otherwise do not

void loadWeights(TaskIdType taskId, TensorPtr weights, TensorPtr config)

load task weights. This method must be called after put. It is designed to be called asynchronously after put returns with load = false

Parameters:
  • taskId[in] the task id

  • weights[in] lora weights tensor

  • config[in] lora config tensor

inline bool isLoaded(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

true if task is loaded (weights are in place) and false otherwise

bool isDone(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

true if task is marked done and can be evicted

inline bool has(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

true if task is in the cache (not necessarily loaded) and false otherwise

std::shared_ptr<std::vector<TaskLayerModuleConfig>> get(TaskIdType taskId)
Parameters:

taskId[in] the task id

Returns:

list of Value objects with pointers to task weights

void bump(TaskIdType taskId)

bump task and make it the most recently used

Parameters:

taskId[in] the task id

void markTaskDone(TaskIdType taskId)

mark task done meaning it can be evicted

Parameters:

taskId[in] the task id

void markAllDone()

mark all tasks in cache done
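
A minimal sketch of the task lifecycle, assuming loraCache is a LoraCache constructed elsewhere and weights/config are the task's LoRA tensors:

    LoraCache::TaskIdType const taskId = 42;   // illustrative id
    if (!loraCache.has(taskId))
    {
        loraCache.put(taskId, weights, config, /*load=*/false);   // claim pages, defer the copy
        loraCache.loadWeights(taskId, weights, config);           // may run asynchronously later
    }
    auto values = loraCache.get(taskId);   // per layer/module pointers into the cache pages
    // ... populate runtime tensors from *values and run the request ...
    loraCache.markTaskDone(taskId);        // the task may now be evicted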

SizeType determineNumPages(TaskIdType taskId) const
Parameters:

taskId[in] the task id

Returns:

number of pages needed to store the given task

SizeType determineNumPages(TensorPtr config) const
Parameters:

config[in] lora config tensor

Returns:

number of pages needed to store the task configured with config tensor

bool fits(TensorPtr config) const
Parameters:

config[in] a lora config tensor

Returns:

true if the task fits in the cache, false otherwise

void copyTask(TaskIdType taskId, LoraCache &deviceCache, bool markDone = false)

copy task to another cache. Caches must have the same page size.

Parameters:
  • taskId[in] the task id to copy

  • deviceCache[in] the LoraCache to copy the task to

  • markDone[in] mark the copied task done as it’s copied

SizeType getNumPages() const
Returns:

total number of pages allocated to cache (used or not)

ITensor::SharedConstPtr getPagePtr(size_t pageId) const
Parameters:

pageId[in] the page id

Returns:

const pointer to page

Public Static Functions

static std::vector<LoraCache::TaskLayerModuleConfig> copyToPages(TensorPtr weights, TensorPtr config, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::unordered_map<SizeType, LoraModule> moduleIdToModel, BufferManager const &manager, std::vector<TensorPtr> const &pages, std::vector<std::size_t> const &pageIds)

Copy task weights to cache pages.

Parameters:
  • weights[in] task weights

  • config[in] task config tensor

  • modelConfig[in] a GptModelConfig

  • worldConfig[in] a WorldConfig

  • moduleIdToModel[in] map from lora module id to LoraModule

  • manager[in] a BufferManager the manager to use to perform the copies

  • pages[out] list of page tensors to copy weights to

  • pageIds[in] page ids for the pages

Returns:

list of cache Value objects

static void splitTransposeCpu(ITensor &output, ITensor const &input, SizeType tpSize, SizeType tpRank)

splits the second dim of input into tpSize parts and writes the tpRank-th split to output

Parameters:
  • output[out] output tensor

  • input[in] input tensor

  • tpSize[in] number of splits

  • tpRank[in] the split to write to output

Private Types

enum ValueStatus

Values:

enumerator kVALUE_STATUS_MISSING
enumerator kVALUE_STATUS_PROCESSING
enumerator kVALUE_STATUS_LOADED
using TaskValuePtr = std::shared_ptr<TaskValue>

Private Functions

void loadWeights(TaskValue &cacheValue, TensorPtr weights, TensorPtr config)
void bumpTaskInProgress(TaskIdType taskId)
ValueStatus getStatus(TaskIdType taskId) const
std::vector<std::size_t> claimPagesWithEvict(SizeType numPages)

claim numPages, evicting tasks if needed

Parameters:

numPages[in] number of pages to claim

Throws:

std::runtime_error – if all pages cannot be claimed

Returns:

list of page ids

std::map<size_t, std::pair<size_t, SizeType>> copyTaskMapPages(TaskValue &targetTaskValue, TaskValue const &sourceTaskValue, std::vector<size_t> const &targetPageIds, LoraCache const &targetCache)

Internal helper method used inside copyTask. Not thread safe on its own

Private Members

LoraCachePageManagerConfig mPageManagerConfig
GptModelConfig mModelConfig
WorldConfig mWorldConfig
mutable std::mutex mPagesMutex
std::unique_ptr<LoraCachePageManager> mCachePageManager
mutable std::mutex mCacheMutex
std::unordered_map<TaskIdType, TaskValuePtr> mCacheMap
std::list<TaskIdType> mInProgressTasks
std::list<TaskIdType> mDoneTasks
std::vector<std::unique_ptr<BufferManager>> mDeviceBufferManagers
std::unique_ptr<BufferManager> mBufferManager
std::unordered_map<SizeType, LoraModule> mModuleIdToModule

Private Static Functions

template<typename T>
static void splitTransposeCpuInner(ITensor &output, ITensor const &input, SizeType tpSize, SizeType tpRank)
struct TaskLayerModuleConfig
#include <loraCache.h>

Contains information on a single layer / module. A list of these configs is associated with each task and can be used to populate runtime tensors.

Public Functions

std::string toString() const
bool operator==(LoraCache::TaskLayerModuleConfig const &o) const

Public Members

std::size_t pageId
SizeType slotIdx
SizeType inSize
SizeType outSize
SizeType moduleId
SizeType layerId
SizeType adapterSize
SizeType numSlots
std::int64_t weightsInPointer
std::int64_t weightsOutPointer
struct TaskValue

Holds configuration and state for a single task.

Public Functions

TaskValue() = delete
~TaskValue() = default
inline TaskValue(std::vector<std::size_t> const &pageIds, TaskLayerModuleConfigListPtr const &configs, std::list<TaskIdType>::iterator it, bool inProgress, bool loaded, bool done, bool loadInProgress = false)
inline TaskValue(TaskValue &&o) noexcept
inline TaskValue &operator=(TaskValue &&o)

Public Members

std::vector<std::size_t> pageIds
TaskLayerModuleConfigListPtr configs
std::list<TaskIdType>::iterator it
bool inProgress
bool loaded
bool done

Marks a task as done during loading. If done=true at the end of loading (end of put, loadWeights, or copyTask), the task will be marked as done.

bool loadInProgress

Indicates that weights are loading, either in put or loadWeights. This is used to block concurrent loadWeights calls for the same task.

loraCachePageManagerConfig.h

namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &os, LoraCachePageManagerConfig const &c)
inline std::string to_string(LoraCachePageManagerConfig const &c)
class LoraCachePageManagerConfig
#include <loraCachePageManagerConfig.h>

Configuration for LoraCachePageManager

See LoraCache docs for description of pages, slots, and page blocks.

Public Functions

inline explicit constexpr LoraCachePageManagerConfig(runtime::MemoryType memType, nvinfer1::DataType dType, SizeType totalNumPages, SizeType maxPagesPerBlock, SizeType slotsPerPage, SizeType pageWidth, SizeType numCopyStreams)
inline constexpr runtime::MemoryType getMemoryType() const noexcept
inline constexpr void setMemoryType(runtime::MemoryType const &memoryType) noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr void setDataType(nvinfer1::DataType const &dtype) noexcept
inline constexpr SizeType getTotalNumPages() const noexcept
inline constexpr void setTotalNumPage(SizeType const &totalNumPages) noexcept
inline constexpr SizeType getMaxPagesPerBlock() const noexcept
inline constexpr void setMaxPagesPerBlock(SizeType const &maxPagesPerBlock) noexcept
inline constexpr SizeType getSlotsPerPage() const noexcept
inline constexpr void setSlotsPerPage(SizeType const &slotsPerPage) noexcept
inline constexpr SizeType getPageWidth() const noexcept
inline constexpr void setPageWidth(SizeType const &pageWidth) noexcept
inline constexpr bool getInitToZero() const noexcept
inline constexpr void setInitToZero(bool initToZero) noexcept
inline constexpr SizeType getNumCopyStreams() const noexcept
inline constexpr void setNumCopyStreams(SizeType numCopyStreams) noexcept
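
A short construction sketch; the values are illustrative, and MemoryType::kGPU is assumed to be the device enumerator:

    LoraCachePageManagerConfig config{MemoryType::kGPU, nvinfer1::DataType::kHALF,
        /*totalNumPages=*/256, /*maxPagesPerBlock=*/64,
        /*slotsPerPage=*/16, /*pageWidth=*/1024, /*numCopyStreams=*/1};
    config.setInitToZero(true);   // zero the page memory when it is allocated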

Private Members

runtime::MemoryType mMemoryType
nvinfer1::DataType mDataType
SizeType mTotalNumPages
SizeType mMaxPagesPerBlock
SizeType mSlotsPerPage
SizeType mPageWidth
SizeType mNumCopyStreams = 1
bool mInitToZero

loraModule.h

namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &output, LoraModule const &module)
class LoraModule

Public Types

enum class ModuleType : SizeType

Values:

enumerator kINVALID
enumerator kATTN_QKV
enumerator kATTN_Q
enumerator kATTN_K
enumerator kATTN_V
enumerator kATTN_DENSE
enumerator kMLP_H_TO_4H
enumerator kMLP_4H_TO_H
enumerator kMLP_GATE
enumerator kCROSS_ATTN_QKV
enumerator kCROSS_ATTN_Q
enumerator kCROSS_ATTN_K
enumerator kCROSS_ATTN_V
enumerator kCROSS_ATTN_DENSE
using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit constexpr LoraModule(ModuleType const &t, SizeType inDim, SizeType outDim, bool inDimFirst, bool outDimFirst, SizeType inTpSplitDim, SizeType outTpSplitDim) noexcept
inline explicit constexpr LoraModule() noexcept
explicit constexpr LoraModule(LoraModule const &o) = default
constexpr LoraModule &operator=(LoraModule const &o) = default
inline constexpr SizeType flattenedInOutSize(SizeType adapterSize) const noexcept
inline constexpr SizeType inSize(SizeType adapterSize) const noexcept
inline constexpr SizeType outSize(SizeType adapterSize) const noexcept
inline constexpr SizeType localInSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType localOutSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType localInDim(SizeType tpSize) const noexcept
inline constexpr SizeType localOutDim(SizeType tpSize) const noexcept
inline constexpr SizeType localInAdapterSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType localOutAdapterSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType localInOutSize(SizeType adapterSize, SizeType tpSize) const noexcept
inline constexpr SizeType value() const noexcept
inline constexpr std::string_view name() const noexcept
inline constexpr SizeType inDim() const noexcept
inline constexpr SizeType outDim() const noexcept
inline constexpr bool inDimFirst() const noexcept
inline constexpr bool outDimFirst() const noexcept
inline constexpr SizeType inTpSplitDim() const noexcept
inline constexpr SizeType outTpSplitDim() const noexcept

Public Static Functions

static std::vector<LoraModule> createLoraModules(std::vector<std::string> const &loraModuleNames, SizeType hiddenSize, SizeType mlpHiddenSize, SizeType numAttentionHeads, SizeType numKvAttentionHeads, SizeType attentionHeadSize, SizeType tpSize)
static inline constexpr ModuleType toModuleType(std::string_view const &name)
static inline constexpr std::string_view toModuleName(ModuleType t) noexcept
static inline constexpr std::string_view toModuleName(SizeType id)
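
A short sketch of constructing a module description by hand and reading back its basic properties; the particular dimensions and split settings are illustrative, not taken from a real model:

    LoraModule module{LoraModule::ModuleType::kATTN_DENSE, /*inDim=*/1024, /*outDim=*/1024,
        /*inDimFirst=*/false, /*outDimFirst=*/true, /*inTpSplitDim=*/1, /*outTpSplitDim=*/-1};
    auto const name = module.name();   // module name as a string_view
    auto const in = module.inDim();    // 1024
    auto const out = module.outDim();  // 1024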

Private Members

ModuleType mType
SizeType mInDim
SizeType mOutDim
bool mInDimFirst
bool mOutDimFirst
SizeType mInTpSplitDim
SizeType mOutTpSplitDim