Runtime

bufferManager.h

namespace tensorrt_llm
namespace runtime
class BufferManager
#include <bufferManager.h>

A helper class for managing memory on host and device.

Public Types

using IBufferPtr = IBuffer::UniquePtr
using ITensorPtr = ITensor::UniquePtr
using CudaStreamPtr = std::shared_ptr<CudaStream>

Public Functions

explicit BufferManager(CudaStreamPtr stream)

Construct a BufferManager.

Parameters:

stream – [in] The CUDA stream to use for all operations on the GPU (allocation, de-allocation, copying, etc.).

IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size on the GPU.

ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions on the GPU.

IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size in UVM.

ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions in UVM.

IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size and memory type.

ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions and memory type.

inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty IBuffer of the given memory type. It may be resized later.

inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty ITensor of the given memory type. It may be reshaped later.

void setZero(IBuffer &buffer) const

Set the contents of the given buffer to zero.

void copy(void const *src, IBuffer &dst, MemoryType srcType) const

Copy src to dst.

void copy(IBuffer const &src, void *dst, MemoryType dstType) const

Copy src to dst.

inline void copy(void const *src, IBuffer &dst) const

Copy src to dst.

inline void copy(IBuffer const &src, void *dst) const

Copy src to dst.

void copy(IBuffer const &src, IBuffer &dst) const

Copy src to dst.

IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

CudaStream const &getStream() const

Get the underlying CUDA stream.

std::size_t memoryPoolReserved() const

The current size of the memory reserved by the memory pool.

std::size_t memoryPoolUsed() const

The current size of the memory used by the memory pool.

std::size_t memoryPoolFree() const

The current size of the memory free in the memory pool.

void memoryPoolTrimTo(std::size_t size)

Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.

Public Static Functions

static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size on the CPU.

static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions on the CPU.

static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned IBuffer of the given size on the CPU.

static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned ITensor of the given dimensions on the CPU.

static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.

static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.

Public Static Attributes

static constexpr auto kBYTE_TYPE = nvinfer1::DataType::kUINT8

Private Members

CudaStreamPtr mStream

Private Static Functions

static void initMemoryPool(int device)
static std::size_t memoryPoolReserved(int device)
static std::size_t memoryPoolUsed(int device)
static inline std::size_t memoryPoolFree(int device)
static void memoryPoolTrimTo(int device, std::size_t size)
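
A minimal usage sketch of BufferManager based on the API above. It assumes the usual TensorRT-LLM runtime headers and that ITensor::makeShape is available from iTensor.h (not listed in this section); error handling is omitted.

#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/cudaStream.h>
#include <tensorrt_llm/runtime/iTensor.h>

#include <cstdint>
#include <memory>
#include <vector>

using namespace tensorrt_llm::runtime;

void bufferManagerSketch()
{
    auto stream = std::make_shared<CudaStream>(); // new non-blocking stream on the current device
    BufferManager manager{stream};

    // Allocate a [2, 3] float tensor on the GPU and zero it (both enqueued on the stream).
    auto tensor = manager.gpu(ITensor::makeShape({2, 3}), nvinfer1::DataType::kFLOAT);
    manager.setZero(*tensor);

    // Copy host data into a new device buffer of matching size and type.
    std::vector<std::int32_t> hostData{1, 2, 3, 4};
    auto deviceCopy = manager.copyFrom(hostData, MemoryType::kGPU);

    // Operations above are asynchronous with respect to the host;
    // synchronize the stream before reading results.
    manager.getStream().synchronize();
}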

common.h

namespace tensorrt_llm
namespace runtime

Typedefs

using SizeType = std::int32_t
using TokenIdType = std::int32_t
template<typename T>
using StringPtrMap = std::unordered_map<std::string, std::shared_ptr<T>>

cudaEvent.h

namespace tensorrt_llm
namespace runtime
class CudaEvent

Public Types

using pointer = cudaEvent_t

Public Functions

inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)

Creates a new CUDA event. The event will be destroyed in the destructor.

Parameters:

flags – Flags for event creation. By default, event timing is disabled.

inline explicit CudaEvent(pointer event, bool ownsEvent = true)

Pass an existing CUDA event to this object.

Parameters:
  • event – The event to pass to this object.

  • ownsEvent – Whether this object owns the event and destroys it in the destructor.

inline pointer get() const

Returns the event associated with this object.

inline void synchronize() const

Synchronizes the event.

Private Types

using element_type = std::remove_pointer_t<pointer>
using EventPtr = std::unique_ptr<element_type, Deleter>

Private Members

EventPtr mEvent
class Deleter

Public Functions

inline explicit Deleter(bool ownsEvent)
inline explicit Deleter()
inline constexpr void operator()(pointer event) const

Private Members

bool mOwnsEvent
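
A short sketch contrasting the two constructors: the owning form destroys the event in its destructor, while the non-owning form leaves destruction to the caller. This assumes a valid CUDA context.

// Owning: CudaEvent creates the event and destroys it when it goes out of scope.
tensorrt_llm::runtime::CudaEvent owned{};

// Non-owning: wrap an event created elsewhere; the caller remains responsible for it.
cudaEvent_t raw{};
cudaEventCreateWithFlags(&raw, cudaEventDisableTiming);
{
    tensorrt_llm::runtime::CudaEvent borrowed{raw, /*ownsEvent=*/false};
    borrowed.synchronize(); // blocks the host until the event has completed
} // borrowed's destructor does not destroy raw
cudaEventDestroy(raw);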

cudaStream.h

namespace tensorrt_llm
namespace runtime
class CudaStream

Public Functions

inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)

Creates a new CUDA stream on the current device. The stream will be destroyed in the destructor.

Parameters:
  • flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.

  • priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.

inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)

Pass an existing CUDA stream to this object.

Parameters:
  • stream – The stream to pass to this object.

  • device – The device on which the stream was created.

  • ownsStream – Whether this object owns the stream and destroys it in the destructor.

inline int getDevice() const

Returns the device on which the stream was created.

inline cudaStream_t get() const

Returns the stream associated with this object.

inline void synchronize() const

Synchronizes the stream.

inline void record(CudaEvent::pointer event) const

Record an event on the stream.

inline void record(CudaEvent const &event) const

Record an event on the stream.

inline void wait(CudaEvent::pointer event) const

Wait for an event.

inline void wait(CudaEvent const &event) const

Wait for an event.

Private Types

using StreamPtr = std::unique_ptr<std::remove_pointer_t<cudaStream_t>, Deleter>

Private Members

StreamPtr mStream
int mDevice = {-1}
class Deleter

Public Functions

inline explicit Deleter(bool ownsStream)
inline explicit Deleter()
inline constexpr void operator()(cudaStream_t stream) const

Private Members

bool mOwnsStream
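
A sketch of cross-stream synchronization with the record/wait pair above: work enqueued on the consumer stream is ordered after the recorded point on the producer stream. This is a minimal pattern, not tied to any particular kernels.

using tensorrt_llm::runtime::CudaEvent;
using tensorrt_llm::runtime::CudaStream;

CudaStream producer{};
CudaStream consumer{};
CudaEvent event{};

// ... enqueue work on producer here ...
producer.record(event); // mark the point after the producer's pending work
consumer.wait(event);   // consumer will not pass this point until the event completes
// ... enqueue dependent work on consumer here ...
consumer.synchronize(); // block the host until the consumer stream drains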

decodingInput.h

namespace tensorrt_llm
namespace runtime
class DecodingInput

Public Types

using TensorPtr = std::shared_ptr<ITensor const>

Public Functions

inline DecodingInput(SizeType maxLength, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxBatchSize, TensorPtr logits, TensorPtr endIds)

Public Members

SizeType step
SizeType maxLength
SizeType maxAttentionWindow
SizeType sinkTokenLength
SizeType maxBatchSize
TensorPtr logits
TensorPtr endIds
TensorPtr finished
TensorPtr sequenceLimitLength
TensorPtr embeddingBias
TensorPtr lengths
TensorPtr badWordsList
TensorPtr stopWordsList
TensorPtr noRepeatNgramSize
TensorPtr batchSlots
TensorPtr cacheIndirection
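
A hedged construction sketch for DecodingInput. The tensor shapes are assumptions following the conventions used elsewhere in this reference ([batchSize, beamWidth, vocabSizePadded] logits, [batchSize] endIds); manager is a BufferManager and ITensor::makeShape comes from iTensor.h.

using TensorConstPtr = std::shared_ptr<ITensor const>; // DecodingInput::TensorPtr

TensorConstPtr logits = manager.gpu(ITensor::makeShape({maxBatchSize, beamWidth, vocabSizePadded}),
    nvinfer1::DataType::kFLOAT);
TensorConstPtr endIds = manager.gpu(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);

DecodingInput input{maxLength, maxAttentionWindow, sinkTokenLength, maxBatchSize,
    std::move(logits), std::move(endIds)};
// Optional members (embeddingBias, badWordsList, stopWordsList, ...) may remain unset.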

decodingOutput.h

namespace tensorrt_llm
namespace runtime
class DecodingOutput

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit DecodingOutput(TensorPtr ids)

Public Members

TensorPtr ids
TensorPtr newTokensSteps
TensorPtr newTokens
std::vector<TensorPtr> newTokensVec
TensorPtr finished
TensorPtr finishedSum
TensorPtr logProbs
TensorPtr cumLogProbs
TensorPtr parentIds
TensorPtr lengths
TensorPtr cacheIndirection
BeamHypotheses beamHypotheses

Public Static Attributes

static constexpr float kNegativeInfinity = -1e20f
class BeamHypotheses

Public Functions

void empty(BufferManager &manager)
void reshape(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
void release()
void init(BufferManager &manager, TokenIdType endId)
BeamHypotheses slice(SizeType batchIndex, SizeType size) const

Public Members

TensorPtr outputIdsTgt
TensorPtr sequenceLengthsTgt
TensorPtr cumLogProbs
TensorPtr normedScores
TensorPtr logProbs
TensorPtr minNormedScores
TensorPtr numBeams
TensorPtr isDone

generationInput.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor, typename PromptTuningParams>
class GenericGenerationInput

Public Types

using TensorPtr = TTensor

Public Functions

inline explicit GenericGenerationInput(SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)

Public Members

SizeType endId
SizeType padId
TensorPtr ids
TensorPtr lengths
bool packed
TensorPtr embeddingBias
TensorPtr badWordsList
TensorPtr stopWordsList
std::optional<SizeType> maxNewTokens
PromptTuningParams promptTuningParams
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>

Public Types

using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationInput(SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)
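
A sketch of building a GenerationInput for a single request. The token values, endId, and padId are illustrative; manager is a BufferManager and ITensor::makeShape comes from iTensor.h.

// [batchSize=1, inputLength=3] token ids and [batchSize=1] input lengths, on GPU.
std::vector<std::int32_t> tokens{100, 200, 300};
auto ids = manager.copyFrom(tokens, ITensor::makeShape({1, 3}), MemoryType::kGPU);
std::vector<std::int32_t> inputLengths{3};
auto lengths = manager.copyFrom(inputLengths, ITensor::makeShape({1}), MemoryType::kGPU);

GenerationInput input{/*endId=*/50256, /*padId=*/50256, std::move(ids), std::move(lengths)};
input.maxNewTokens = 16; // optional cap on the number of generated tokens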

generationOutput.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor>
class GenericGenerationOutput

Public Types

using TensorPtr = TTensor
using Callback = std::function<void(TensorPtr const &ids, SizeType step, bool finished)>

Public Functions

inline explicit GenericGenerationOutput(TensorPtr ids, TensorPtr lengths)

Public Members

TensorPtr ids
TensorPtr lengths
TensorPtr cumLogProbs
TensorPtr logProbs
TensorPtr contextLogits
TensorPtr generationLogits
Callback onTokenGenerated
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>

Public Types

using Base = GenericGenerationOutput<ITensor::SharedPtr>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationOutput(TensorPtr ids, TensorPtr lengths)

gptDecoder.h

namespace tensorrt_llm
namespace layers
namespace runtime
class IGptDecoder

Subclassed by tensorrt_llm::runtime::GptDecoder< T >

Public Functions

virtual ~IGptDecoder() = default
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength) = 0
virtual bool forward(DecodingOutput &output, DecodingInput const &input) = 0
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) = 0
virtual const SamplingConfig &getSamplingConfig() = 0

Public Static Functions

static void acceptDraftTokensByIds(const ITensor &targetTokenIds, const ITensor &draftTokenIds, const ITensor &contextLengths, const ITensor &numDraftTokens, ITensor &sequenceLengths, const ITensor &finishedVec, ITensor &finishedFinal, ITensor &finishedSum, BufferManager::CudaStreamPtr const &stream)
static void acceptDraftTokensByLogits(ITensor &draftLogits, const ITensor &targetLogits, ITensor &draftProbs, ITensor &targetProbs, const ITensor &numDraftTokens, ITensor &finished, SizeType vocabSize, SizeType vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, curandState_t *curandState, BufferManager::CudaStreamPtr const &stream)
static inline std::unique_ptr<IGptDecoder> create(nvinfer1::DataType dtype, size_t maxBatchSize, size_t vocabSize, size_t vocabSizePadded, BufferManager::CudaStreamPtr const &stream)
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder

Public Types

using CudaStreamPtr = BufferManager::CudaStreamPtr
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

GptDecoder(size_t maxBatchSize, size_t vocabSize, size_t vocabSizePadded, CudaStreamPtr const &stream)
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength) override
virtual bool forward(DecodingOutput &output, DecodingInput const &input) override
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) override
inline virtual const SamplingConfig &getSamplingConfig() override

Private Members

BufferManager mManager
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
TensorPtr mLogProbsTiled
SamplingConfig mSamplingConfig
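
A hedged sketch of the single-step decoder API. streamPtr, samplingConfig, and the decoding input/output are assumed to be prepared as in decodingInput.h and decodingOutput.h.

auto decoder = IGptDecoder::create(nvinfer1::DataType::kFLOAT, maxBatchSize, vocabSize,
    vocabSizePadded, streamPtr);
decoder->setup(samplingConfig, batchSize, maxSequenceLength);

// Synchronous step; the bool return is assumed to signal that all sequences finished.
bool allFinished = decoder->forward(decodingOutput, decodingInput);

// Or enqueue the step without blocking the host:
decoder->forwardAsync(decodingOutput, decodingInput);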

gptDecoderBatch.h

namespace tensorrt_llm
namespace runtime
class GptDecoderBatch : public tensorrt_llm::runtime::IGptDecoderBatch
#include <gptDecoderBatch.h>

GPT decoder class with support for in-flight batching.

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = ITensor::SharedPtr

Public Functions

GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream)
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, nvinfer1::DataType dtype) override

Setup the decoder before calling forward()

virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) override

Initialize the decoder at batchIdx with a new request.

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) override

Initialize the decoder with a new batch of inputs.

virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::Token const &e) override

Wait for the call to forwardAsync associated with a token to complete.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override

Run one step for all requests without blocking the host thread.

virtual void forwardSync() override

Wait for the last call to forwardAsync to complete.

inline virtual std::vector<bool> getFinished() const override
Returns:

[batchSize], indicators of finished requests

inline virtual TensorPtr getOutputIds(SizeType batchIdx) const override
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu

inline virtual TensorPtr getOutputIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu

virtual CudaEvent finalize(SizeType batchIdx) const

Gather final beam search results for request batchIdx. The result is only available after synchronizing on the returned event.

virtual void finalize() const override

Gather final beam search results for all requests.

inline virtual TensorPtr getParentIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu

inline virtual TensorPtr getCumLogProbs() const override
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getCumLogProbs(SizeType batchIdx) const
Returns:

[maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs() const override
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs(SizeType batchIdx) const
Returns:

[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getAllNewTokens() const override

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

inline virtual TensorPtr getNewTokens(SizeType iter = 0) const override

Get tokens generated in one step of last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

inline virtual std::vector<SizeType> getNbSteps() const override
Returns:

[batchSize], the number of generation steps executed on each request

inline virtual TensorPtr getNbFinished() const override
Returns:

[1], number of finished sequences, in pinned host memory

Private Types

using GptDecoderPtr = std::unique_ptr<IGptDecoder>
using DecodingInputPtr = std::unique_ptr<DecodingInput>
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>

Private Functions

CudaEvent postProcessRequest(SizeType batchIdx) const

Gather final beam search results for request batchIdx.

Private Members

std::size_t const mVocabSize
std::size_t const mVocabSizePadded
CudaStreamPtr mStream
BufferManager mBufferManager
TokenPtr mForwardToken
CudaEvent mForwardEvent
std::vector<CudaStreamPtr> mStreams
std::vector<GptDecoderPtr> mDecoders
std::vector<DecodingInputPtr> mDecodingInputs
std::vector<DecodingOutputPtr> mDecodingOutputs
DecodingInputPtr mJointDecodingInput
DecodingOutputPtr mJointDecodingOutput
std::vector<TensorPtr> mDraftTokenIds
std::vector<TensorPtr> mDraftLogits
std::vector<bool> mAcceptByLogits
TensorPtr mNumDraftTokens
TensorPtr mCurandStates
std::vector<SizeType> mNbSteps
std::vector<bool> mFinished
TensorPtr mFinishedSum
std::vector<SizeType> mMaxNewTokens
std::vector<SizeType> mBeamWidths
std::vector<SizeType> mGeneratedTokensPerStep
TensorPtr mFinishedSteps
TensorPtr mDraftProbs
TensorPtr mTargetProbs
SizeType mMaxSequenceLength = {}
SizeType mMaxAttentionWindow = {}
SizeType mSinkTokenLength = {}
SizeType mActualBatchSize = {}
SizeType mMaxTokensPerStep = {}
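
A hedged sketch of the in-flight batching flow implied by the API above. Construction of the decoder_batch request, input, and output objects is not shown here.

GptDecoderBatch decoder{vocabSize, vocabSizePadded, streamPtr};
decoder.setup(maxBatchSize, maxBeamWidth, maxAttentionWindow, sinkTokenLength,
    maxSequenceLength, maxTokensPerStep, nvinfer1::DataType::kFLOAT);

// Slot a new request into the batch at batchIdx.
decoder.newRequest(batchIdx, request, samplingConfig);

// Run one step for all active requests, then synchronize on the returned token.
auto token = decoder.forwardAsync(output, input);
decoder.forwardSync(*token);

auto newTokens = decoder.getNewTokens(); // [batchSize, beamWidth], on gpu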

gptJsonConfig.h

namespace tensorrt_llm
namespace runtime
class GptJsonConfig

Public Functions

inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, GptModelConfig const &modelConfig)
inline GptModelConfig getModelConfig() const
inline std::string const &getName() const
inline std::string const &getVersion() const
inline std::string const &getPrecision() const
inline constexpr SizeType getTensorParallelism() const
inline constexpr SizeType getPipelineParallelism() const
inline constexpr SizeType getWorldSize() const
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
inline std::string engineFilename(WorldConfig const &worldConfig) const

Public Static Functions

static GptJsonConfig parse(std::string const &json)
static GptJsonConfig parse(std::istream &json)
static GptJsonConfig parse(std::filesystem::path const &path)

Private Members

std::string const mName
std::string const mVersion
std::string const mPrecision
SizeType const mTensorParallelism
SizeType const mPipelineParallelism
GptModelConfig const mGptModelConfig
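
A sketch of loading an engine configuration and resolving the engine file for the current rank. The directory layout is illustrative; worldConfig describes this process's rank and parallelism (see worldConfig.h).

auto jsonConfig = GptJsonConfig::parse(std::filesystem::path{"engine_dir/config.json"});
auto modelConfig = jsonConfig.getModelConfig();

auto enginePath = std::string{"engine_dir/"} + jsonConfig.engineFilename(worldConfig);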

gptModelConfig.h

namespace tensorrt_llm
namespace runtime
class GptModelConfig

Public Types

enum class ModelVariant : std::int32_t

Values:

enumerator kGpt
enumerator kGlm

Public Functions

inline explicit GptModelConfig(SizeType vocabSize, SizeType nbLayers, SizeType nbHeads, SizeType hiddenSize, nvinfer1::DataType dtype)
inline constexpr SizeType getVocabSize() const noexcept
inline constexpr SizeType getVocabSizePadded(SizeType worldSize) const noexcept
inline constexpr SizeType getNbLayers(SizeType pipelineParallelism = 1) const
inline constexpr SizeType getNbHeads() const noexcept
inline constexpr SizeType getNbKvHeads() const noexcept
inline constexpr void setNbKvHeads(SizeType nbKvHeads) noexcept
inline constexpr SizeType getHiddenSize() const noexcept
inline constexpr SizeType getSizePerHead() const noexcept
inline constexpr void setSizePerHead(SizeType sizePerHead) noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr bool useGptAttentionPlugin() const noexcept
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
inline constexpr bool usePackedInput() const noexcept
inline constexpr void usePackedInput(bool inputPacked) noexcept
inline constexpr bool usePagedKvCache() const noexcept
inline constexpr void usePagedKvCache(bool pagedKvCache) noexcept
inline constexpr SizeType getTokensPerBlock() const noexcept
inline constexpr void setTokensPerBlock(SizeType TokensPerBlock) noexcept
inline constexpr common::QuantMode getQuantMode() const noexcept
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
inline constexpr bool supportsInflightBatching() const noexcept
inline constexpr SizeType getMaxBatchSize() const noexcept
inline constexpr void setMaxBatchSize(SizeType maxBatchSize) noexcept
inline constexpr SizeType getMaxBeamWidth() const noexcept
inline constexpr void setMaxBeamWidth(SizeType maxBeamWidth) noexcept
inline constexpr SizeType getMaxInputLen() const noexcept
inline constexpr void setMaxInputLen(SizeType maxInputLen) noexcept
inline constexpr SizeType getMaxSequenceLen() const noexcept
inline constexpr void setMaxSequenceLen(SizeType maxSequenceLen) noexcept
inline constexpr std::optional<SizeType> getMaxNumTokens() const noexcept
inline constexpr void setMaxNumTokens(std::optional<SizeType> maxNumTokens) noexcept
inline constexpr bool usePromptTuning() const noexcept
inline constexpr SizeType getMaxPromptEmbeddingTableSize() const noexcept
inline constexpr void setMaxPromptEmbeddingTableSize(SizeType maxPromptEmbeddingTableSize) noexcept
inline constexpr bool computeContextLogits() const noexcept
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
inline constexpr bool computeGenerationLogits() const noexcept
inline constexpr void computeGenerationLogits(bool computeGenerationLogits) noexcept
inline ModelVariant getModelVariant() const
inline void setModelVariant(ModelVariant modelVariant)
inline constexpr bool useCustomAllReduce() const noexcept
inline constexpr void useCustomAllReduce(bool customAllReduce) noexcept
inline constexpr void setMaxDraftLen(SizeType maxDraftLen) noexcept
inline SizeType getMaxDraftLen() const
inline constexpr SizeType getMaxTokensPerStep() const noexcept
inline constexpr void setUseContextFMHAForGeneration(bool useContextFMHAForGeneration) noexcept
inline constexpr bool getContextFMHAForGeneration() const noexcept
inline constexpr void setPagedContextFMHA(bool pagedContextFMHA) noexcept
inline constexpr bool getPagedContextFMHA() const noexcept
inline constexpr bool useLoraPlugin() const noexcept
inline constexpr void useLoraPlugin(bool useLoraPlugin) noexcept
inline std::vector<LoraModule> const &getLoraModules() const noexcept
inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept
inline constexpr SizeType getMlpHiddenSize() const noexcept
inline constexpr void setMlpHiddenSize(SizeType mlpHiddenSize) noexcept
inline constexpr SizeType getMaxLoraRank() const noexcept
inline constexpr void setMaxLoraRank(SizeType maxLoraRank) noexcept

Private Members

SizeType mVocabSize
SizeType mNbLayers
SizeType mNbHeads
SizeType mNbKvHeads
SizeType mHiddenSize
SizeType mSizePerHead
nvinfer1::DataType mDataType
bool mUseGptAttentionPlugin
bool mInputPacked
bool mPagedKvCache
SizeType mTokensPerBlock
common::QuantMode mQuantMode
SizeType mMaxBatchSize
SizeType mMaxBeamWidth
SizeType mMaxInputLen
SizeType mMaxSequenceLen
std::optional<SizeType> mMaxNumTokens
bool mComputeContextLogits
bool mComputeGenerationLogits
ModelVariant mModelVariant
bool mUseCustomAllReduce
SizeType mMaxPromptEmbeddingTableSize
SizeType mMaxDraftLen
bool mUseContextFMHAForGeneration
bool mPagedContextFMHA
bool mUseLoraPlugin
std::vector<LoraModule> mLoraModules
SizeType mMlpHiddenSize
SizeType mMaxLoraRank
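
A sketch of a hand-built model configuration. All values are illustrative; engines produced by the TensorRT-LLM builder normally carry this configuration in their config.json instead.

GptModelConfig modelConfig{/*vocabSize=*/32000, /*nbLayers=*/32, /*nbHeads=*/32,
    /*hiddenSize=*/4096, nvinfer1::DataType::kHALF};
modelConfig.useGptAttentionPlugin(true);
modelConfig.usePackedInput(true);
modelConfig.usePagedKvCache(true);
modelConfig.setTokensPerBlock(64);
modelConfig.setMaxBatchSize(8);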

gptSession.h

namespace tensorrt_llm
namespace batch_manager
namespace kv_cache_manager
namespace runtime
class GptSession

Public Types

using LoggerPtr = std::shared_ptr<nvinfer1::ILogger>

Public Functions

GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
nvinfer1::ILogger &getLogger() const
BufferManager const &getBufferManager() const
inline GptModelConfig const &getModelConfig() const
inline WorldConfig const &getWorldConfig() const
inline int getDevice() const noexcept
nvinfer1::DataType getLogitDataType() const
void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)

Private Types

using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
using TensorPtr = runtime::ITensor::SharedPtr
using TokenGeneratedCallback = std::function<void(SizeType step, bool finished)>

Private Functions

inline bool useCudaGraphs()
void generateBatched(std::vector<GenerationOutput> &microBatchesOutputs, std::vector<GenerationInput> const &microBatchesInputs, SamplingConfig const &samplingConfig, TokenGeneratedCallback const &onTokenGenerated)
void setup(Config const &sessionConfig)
void createContexts()
void createBuffers(SizeType numMicroBatches)
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches)
void createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, KvCacheConfig const &config)
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType> const &generationBatchesOffsets, KvCacheManager const *kvCacheManager)
SizeType executeGenerationStep(SizeType step, std::vector<GenerationInput> const &microBatchesInputs, std::vector<GenerationOutput> &microBatchesOutputs, std::vector<SizeType> const &microBatchOffsets, KvCacheManager *kvCacheManager, std::vector<bool> &microBatchesFinished)
void decoderStepAsync(SizeType decoderStep, SizeType microBatchId)

Execute the decoder on the last PP rank; receive the decoder output on the other PP ranks.

bool shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType microBatchId)

Synchronize with the decoder and return the shouldStop flag.

void finalize(SizeType microBatchId)

Collect the final output ids and log probs on the last PP rank and send them to the first PP rank.

Receives are asynchronous on host, so synchronization is required before access.

void kvCacheAddSequences(SizeType beamWidth, SizeType microBatchId, SizeType firstBatchIdx)
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType microBatchId) const

Populate outputIds and return reference to newTokens tensor.

TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)

Private Members

GptModelConfig const mModelConfig
WorldConfig const mWorldConfig
int mDevice = {-1}
std::shared_ptr<NcclCommunicator> mPipelineComm
std::shared_ptr<CudaStream> mCommStream
CudaEvent mCommEvent = {}
ITensor::SharedPtr mCommPtrs
std::vector<std::shared_ptr<IpcMemory>> mIpcMemoryHandles
SizeType mDecoderMaxSequenceLength = {}
SizeType mDecoderMaxAttentionWindow = {}
SizeType mDecoderSinkTokenLength = {}
LoggerPtr mLogger
std::shared_ptr<TllmRuntime> mRuntime
std::shared_ptr<KvCacheManager> mKvCacheManager
MicroBatchConfig mMicroBatchConfig
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
std::vector<CudaEvent> mReceivedEvents
bool mCudaGraphMode = {false}
std::vector<CudaGraphExecutor> mCudaGraphInstances

Friends

friend class batch_manager::TrtGptModelV1
class Config
#include <gptSession.h>

Configuration for session execution and buffer sizes. generate may be called with a batch size and beam width smaller than the configured values.

maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.

Public Functions

inline Config(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength)

Public Members

SizeType maxBatchSize
SizeType maxBeamWidth
SizeType maxSequenceLength
bool decoderPerRequest = {false}
bool cudaGraphMode = {false}
KvCacheConfig kvCacheConfig = {}
std::optional<SizeType> ctxMicroBatchSize = std::nullopt
std::optional<SizeType> genMicroBatchSize = std::nullopt
class CudaGraphExecutor

Public Functions

CudaGraphExecutor() = default
inline ~CudaGraphExecutor()
inline bool hasInstance()
void clear()
void prepareNextGraph(TllmRuntime const &runtime, SizeType nextContextId)
void launch(CudaStream const &stream)

Private Functions

void create(cudaGraph_t const &graph)
bool update(cudaGraph_t const &graph)
void uploadToStream(CudaStream const &stream)

Private Members

cudaGraphExec_t mInstance
class MicroBatchConfig

Public Functions

inline MicroBatchConfig()
explicit MicroBatchConfig(SizeType maxBatchSize, SizeType pipelineParallelism, std::optional<SizeType> genMicroBatchSize, std::optional<SizeType> ctxMicroBatchSize)
inline constexpr SizeType numCtxPerGen() const
inline constexpr SizeType getGenGraphId(SizeType flipFlopId, SizeType generationBatchId) const

Flip-flop between two graph instances for each generation batch.

Public Members

SizeType numCtxBatches
SizeType numGenBatches
SizeType ctxBatchSize
SizeType genBatchSize
namespace utils

Functions

std::vector<uint8_t> loadEngine(std::string const &enginePath)
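
An end-to-end sketch tying the classes above together. Paths and sizes are placeholders; WorldConfig and SamplingConfig are documented elsewhere in this reference, and their default constructions here are assumptions suited to a single-GPU setup.

auto jsonConfig = GptJsonConfig::parse(std::filesystem::path{"engine_dir/config.json"});
WorldConfig worldConfig{}; // assumption: defaults describe a single rank
auto enginePath = std::string{"engine_dir/"} + jsonConfig.engineFilename(worldConfig);

GptSession::Config sessionConfig{/*maxBatchSize=*/8, /*maxBeamWidth=*/1, /*maxSequenceLength=*/1024};
GptSession session{sessionConfig, jsonConfig.getModelConfig(), worldConfig, enginePath};

SamplingConfig samplingConfig{/*beamWidth=*/1};
// inputs/outputs built as shown under generationInput.h and generationOutput.h.
session.generate(outputs, inputs, samplingConfig);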

iBuffer.h

template<>
struct MemoryTypeString<MemoryType::kGPU>

Public Static Attributes

static constexpr auto value = "GPU"
template<>
struct MemoryTypeString<MemoryType::kCPU>

Public Static Attributes

static constexpr auto value = "CPU"
template<>
struct MemoryTypeString<MemoryType::kPINNED>

Public Static Attributes

static constexpr auto value = "PINNED"
template<>
struct MemoryTypeString<MemoryType::kUVM>

Public Static Attributes

static constexpr auto value = "UVM"
template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

Public Types

using type = float

Public Static Attributes

static constexpr char name[] = "float"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>

Public Types

using type = half

Public Static Attributes

static constexpr char name[] = "half"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>

Public Types

using type = std::int8_t

Public Static Attributes

static constexpr char name[] = "int8"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>

Public Types

using type = std::int32_t

Public Static Attributes

static constexpr char name[] = "int32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>

Public Types

using type = std::int64_t

Public Static Attributes

static constexpr char name[] = "int64"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

Public Types

using type = std::uint32_t

Public Static Attributes

static constexpr char name[] = "uint32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

Public Types

using type = std::uint64_t

Public Static Attributes

static constexpr char name[] = "uint64"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

Public Types

using type = bool

Public Static Attributes

static constexpr char name[] = "bool"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

Public Types

using type = std::uint8_t

Public Static Attributes

static constexpr char name[] = "uint8"
static constexpr auto size = sizeof(type)
template<>
struct TRTDataType<std::int8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT8
template<>
struct TRTDataType<std::int32_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT32
template<>
struct TRTDataType<std::uint32_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
template<>
struct TRTDataType<std::int64_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT64
template<>
struct TRTDataType<std::uint64_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
template<>
struct TRTDataType<std::uint8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kUINT8
namespace tensorrt_llm
namespace runtime

Typedefs

template<typename T>
using PointerElementType = typename std::remove_reference_t<T>::element_type

Enums

enum class MemoryType : std::int32_t

Values:

enumerator kGPU
enumerator kCPU
enumerator kPINNED
enumerator kUVM

Functions

template<typename T>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::shared_ptr<T> const &ptr) noexcept
template<typename T, typename D>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::unique_ptr<T, D> &&ptr) noexcept
template<typename T>
T const *bufferCast(IBuffer const &buffer)
template<typename T>
T *bufferCast(IBuffer &buffer)
std::ostream &operator<<(std::ostream &output, IBuffer const &buffer)

Utility function to print a buffer.
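
A sketch of typed buffer access with bufferCast; the cast is assumed to validate that T matches the buffer's data type.

#include <iostream>

auto buffer = BufferManager::cpu(16, nvinfer1::DataType::kINT32);
std::int32_t* data = bufferCast<std::int32_t>(*buffer);
data[0] = 42;
std::cout << *buffer << std::endl; // uses the utility printer declared above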

template<MemoryType T>
struct MemoryTypeString
template<>
struct MemoryTypeString<MemoryType::kGPU>

Public Static Attributes

static constexpr auto value = "GPU"
template<>
struct MemoryTypeString<MemoryType::kCPU>

Public Static Attributes

static constexpr auto value = "CPU"
template<>
struct MemoryTypeString<MemoryType::kPINNED>

Public Static Attributes

static constexpr auto value = "PINNED"
template<>
struct MemoryTypeString<MemoryType::kUVM>

Public Static Attributes

static constexpr auto value = "UVM"
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits
#include <iBuffer.h>

For converting a TensorRT data type to a C++ data type.

template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

Public Types

using type = float

Public Static Attributes

static constexpr char name[] = "float"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>

Public Types

using type = half

Public Static Attributes

static constexpr char name[] = "half"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>

Public Types

using type = std::int8_t

Public Static Attributes

static constexpr char name[] = "int8"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>

Public Types

using type = std::int32_t

Public Static Attributes

static constexpr char name[] = "int32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>

Public Types

using type = std::int64_t

Public Static Attributes

static constexpr char name[] = "int64"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

Public Types

using type = std::uint32_t

Public Static Attributes

static constexpr char name[] = "uint32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

Public Types

using type = std::uint64_t

Public Static Attributes

static constexpr char name[] = "uint64"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

Public Types

using type = bool

Public Static Attributes

static constexpr char name[] = "bool"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

Public Types

using type = std::uint8_t

Public Static Attributes

static constexpr char name[] = "uint8"
static constexpr auto size = sizeof(type)
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>

Public Types

using type = typename DataTypeTraits<kDataType, kUnsigned, false>::type*

Public Static Attributes

static constexpr char name[] = "*"
static constexpr auto size = sizeof(type)
class BufferDataType
#include <iBuffer.h>

A wrapper around nvinfer1::DataType that provides support for pointer types.

Public Functions

inline constexpr BufferDataType(nvinfer1::DataType dataType, bool _unsigned = false, bool pointer = false)
inline constexpr operator nvinfer1::DataType() const noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr bool isPointer() const noexcept
inline constexpr bool isUnsigned() const
inline constexpr std::size_t getSize() const noexcept

Public Static Attributes

static constexpr auto kTrtPointerType = nvinfer1::DataType::kINT64

Private Members

nvinfer1::DataType mDataType
bool mUnsigned
bool mPointer
template<typename T, bool = false>
struct TRTDataType
#include <iBuffer.h>

For converting a C++ data type to a TensorRT data type.

template<>
struct TRTDataType<float>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kFLOAT
template<>
struct TRTDataType<half>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kHALF
template<>
struct TRTDataType<std::int8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT8
template<>
struct TRTDataType<std::int32_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT32
template<>
struct TRTDataType<std::uint32_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
template<>
struct TRTDataType<std::int64_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT64
template<>
struct TRTDataType<std::uint64_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
template<>
struct TRTDataType<bool>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kBOOL
template<>
struct TRTDataType<std::uint8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kUINT8
template<>
struct TRTDataType<void*>

Public Static Attributes

static constexpr auto value = BufferDataType::kTrtPointerType
template<typename T>
struct TRTDataType<T*>

Public Static Attributes

static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}

Private Static Attributes

static constexpr auto kUnderlyingType = BufferDataType{TRTDataType<T, false>::value}
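
Compile-time checks illustrating the two mappings: TRTDataType converts C++ types to TensorRT data types, DataTypeTraits goes the other way. Assumes <type_traits> is included.

static_assert(TRTDataType<float>::value == nvinfer1::DataType::kFLOAT);
static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kINT32>::type, std::int32_t>);
// Pointer types map onto the reserved TensorRT pointer type (kINT64).
static_assert(TRTDataType<void*>::value == BufferDataType::kTrtPointerType);
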
class IBuffer

Subclassed by tensorrt_llm::runtime::ITensor

Public Types

using UniquePtr = std::unique_ptr<IBuffer>
using SharedPtr = std::shared_ptr<IBuffer>
using UniqueConstPtr = std::unique_ptr<IBuffer const>
using SharedConstPtr = std::shared_ptr<IBuffer const>
using DataType = nvinfer1::DataType

Public Functions

virtual void *data() = 0

Returns a pointer to the underlying array.

virtual void const *data() const = 0

Returns a pointer to the underlying array.

inline virtual void *data(std::size_t index)

Returns a pointer to the underlying array at a given element index.

inline virtual void const *data(std::size_t index) const

Returns a pointer to the underlying array at a given element index.

virtual std::size_t getSize() const = 0

Returns the size (in number of elements) of the buffer.

inline virtual std::size_t getSizeInBytes() const

Returns the size (in bytes) of the buffer.

virtual std::size_t getCapacity() const = 0

Returns the capacity of the buffer.

virtual DataType getDataType() const = 0

Returns the data type of the buffer.

virtual char const *getDataTypeName() const
virtual MemoryType getMemoryType() const = 0

Returns the memory type of the buffer.

virtual char const *getMemoryTypeName() const
virtual void resize(std::size_t newSize) = 0

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

virtual void release() = 0

Releases the buffer. It will be reset to nullptr.

virtual ~IBuffer() = default
IBuffer(IBuffer const&) = delete

Not allowed to copy.

IBuffer &operator=(IBuffer const&) = delete

Not allowed to copy.

Public Static Functions

static UniquePtr slice(SharedPtr buffer, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying buffer.
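
A minimal sketch of the slice view; the view is assumed to share storage with the source buffer rather than copy it.

IBuffer::SharedPtr buffer = BufferManager::cpu(8, nvinfer1::DataType::kFLOAT);
auto firstHalf = IBuffer::slice(buffer, /*offset=*/0, /*size=*/4);
// firstHalf->getSize() == 4 and, at offset 0, firstHalf->data() == buffer->data().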