Runtime
bufferManager.h
-
namespace tensorrt_llm
-
namespace runtime
-
class BufferManager
- #include <bufferManager.h>
A helper class for managing memory on host and device.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
using CudaMemPoolPtr = std::shared_ptr<CudaMemPool>
Public Functions
-
explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)
Construct a BufferManager.
- Parameters:
cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
-
inline ~BufferManager()
Destructor.
-
IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
-
ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
-
IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size and memory type.
-
ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions and memory type.
-
inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty IBuffer of the given memory type. It may be resized later.
-
inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty ITensor of the given memory type. It may be reshaped later.
-
void copy(void const *src, IBuffer &dst, MemoryType srcType) const
Copy src to dst.
-
void copy(IBuffer const &src, void *dst, MemoryType dstType) const
Copy src to dst.
-
IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
CudaStream const &getStream() const
Get the underlying cuda stream.
-
std::size_t memoryPoolReserved() const
The current size of the memory reserved by the memory pool.
-
std::size_t memoryPoolUsed() const
The current size of the memory used by the memory pool.
-
std::size_t memoryPoolFree() const
The current size of the memory free in the memory pool.
-
void memoryPoolTrimTo(std::size_t size)
Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
Public Static Functions
-
static IBufferPtr gpuSync(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
-
static ITensorPtr gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
-
static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the CPU.
-
static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU.
-
static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
-
static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
-
static IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size in UVM.
-
static ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions in UVM.
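A minimal usage sketch (not part of the API reference; it assumes the tensorrt_llm::runtime namespace, a valid CUDA device, and the header names shown in this listing):

#include <bufferManager.h>
#include <cudaStream.h>

#include <cstdint>
#include <memory>
#include <vector>

using namespace tensorrt_llm::runtime;

void bufferManagerSketch()
{
    // The manager performs all GPU allocations and copies on this stream.
    auto streamPtr = std::make_shared<CudaStream>();
    BufferManager manager{streamPtr};

    // Stream-ordered (cudaMallocAsync) allocation of 1024 float elements on the GPU.
    auto gpuBuffer = manager.gpu(1024, nvinfer1::DataType::kFLOAT);

    // Copy host data into a new GPU buffer with matching size and type.
    std::vector<std::int32_t> hostIds{1, 2, 3, 4};
    auto deviceIds = manager.copyFrom(hostIds, MemoryType::kGPU);

    // Copy the data back into a new pinned host buffer.
    auto pinnedCopy = manager.copyFrom(*deviceIds, MemoryType::kPINNED);

    // Inspect the memory pool and trim it (the trim synchronizes with the stream).
    auto const reserved = manager.memoryPoolReserved();
    manager.memoryPoolTrimTo(0);
}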
Friends
- friend class ::BufferManagerTest
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
class BufferManager
-
namespace runtime
common.h
Defines
-
FMT_DIM
-
namespace tensorrt_llm
-
namespace runtime
Typedefs
-
using SizeType32 = std::int32_t
-
using SizeType64 = std::int64_t
-
using TokenIdType = std::int32_t
-
using LoraTaskIdType = std::uint64_t
-
using TokenExtraIdType = std::uint64_t
-
using VecTokenExtraIds = std::vector<TokenExtraIdType>
-
using VecUniqueTokens = std::vector<UniqueToken>
-
struct UniqueToken
Public Functions
-
inline bool operator==(UniqueToken const &other) const noexcept
-
inline bool operator==(UniqueToken const &other) const noexcept
-
using SizeType32 = std::int32_t
-
namespace runtime
cudaEvent.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaEvent
Public Types
-
using pointer = cudaEvent_t
Public Functions
-
inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)
Creates a new cuda event. The event will be destroyed in the destructor.
- Parameters:
flags – Flags for event creation. By default, event timing is disabled.
-
inline explicit CudaEvent(pointer event, bool ownsEvent = true)
Pass an existing cuda event to this object.
- Parameters:
event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
-
inline void synchronize() const
Synchronizes the event.
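A minimal sketch of the two construction modes described above (assuming the tensorrt_llm::runtime namespace and a valid CUDA context):

// Owning event created by this object; timing enabled instead of the default cudaEventDisableTiming.
CudaEvent timed{cudaEventDefault};

// Adopt an event created elsewhere; it is destroyed in the destructor because ownsEvent is true.
cudaEvent_t raw{};
cudaEventCreate(&raw);
CudaEvent adopted{raw, /*ownsEvent=*/true};

adopted.synchronize(); // blocks the host until the event has completed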
Private Types
-
using EventPtr = std::unique_ptr<element_type, Deleter>
-
using pointer = cudaEvent_t
-
class CudaEvent
-
namespace runtime
cudaStream.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaStream
Public Functions
-
inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)
Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
- Parameters:
flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
-
inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)
Pass an existing cuda stream to this object.
- Parameters:
stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
-
inline explicit CudaStream(cudaStream_t stream)
Construct with an existing cuda stream or the default stream by passing nullptr.
-
inline int getDevice() const
Returns the device on which the stream was created.
-
inline cudaStream_t get() const
Returns the stream associated with this object.
-
inline void synchronize() const
Synchronizes the stream.
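A minimal sketch of the constructors above (assuming the tensorrt_llm::runtime namespace and a valid CUDA context):

// New non-blocking stream on the current device (default flags and priority).
CudaStream ownedStream{};

// Wrap a stream created elsewhere; it is destroyed in the destructor because ownsStream is true.
cudaStream_t raw{};
cudaStreamCreateWithFlags(&raw, cudaStreamNonBlocking);
int device{};
cudaGetDevice(&device);
CudaStream adopted{raw, device, /*ownsStream=*/true};

adopted.synchronize();               // blocks the host until all work on the stream has completed
cudaStream_t handle = adopted.get(); // raw handle for interop with CUDA APIs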
Friends
- friend class CudaStreamBindings
-
inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)
-
class CudaStream
-
namespace runtime
decodingInput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class DecodingInput
- #include <decodingInput.h>
Represents the inputs to the decoder.
This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.
Public Functions
-
inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize, TensorConstPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)
Public Members
-
SizeType32 step
The index of the decoding step we are on. Only used in Python runtime.
-
SizeType32 maxLength
The maximum number of tokens to decode.
-
SizeType32 maxAttentionWindow
The maximum length of the attention window to consider while decoding.
-
SizeType32 sinkTokenLength
The number of tokens to use as attention sinks, as described in https://arxiv.org/html/2309.17453v3.
-
SizeType32 batchSize
The number of samples in the batch.
-
SizeType32 maxStopWordsLen
The maximum value in the stopWordsLens tensor.
-
SizeType32 maxBadWordsLen
The maximum value in the badWordsLens tensor.
-
TensorConstPtr logits
[batchSize, beamWidth, vocabSizePadded], on gpu. Logits are a probability distribution over the vocabulary, the output of the model.
-
TensorConstPtr endIds
[batchSize * beamWidth], on gpu
-
TensorConstPtr batchSlots
[batchSize], address map of the linear batch id to the seq slots, int32_t, pinned
-
TensorConstPtr finishReasons
[batchSize, beamWidth], finished states at the current iteration. If true for a request, its decoding step is skipped. On gpu
-
TensorConstPtr sequenceLimitLength
[batchSize], on gpu. The maximum sequence length for each sequence in the batch.
-
TensorConstPtr embeddingBias
[batchSize, vocabSizePadded], on gpu
-
TensorConstPtr lengths
[batchSize, beamWidth], on gpu
-
TensorConstPtr badWordsPtrs
[batchSize][2, badWordsLength], on gpu
-
TensorConstPtr badWordsLens
[batchSize], on gpu
-
TensorConstPtr stopWordsPtrs
[batchSize][2, stopWordsLength], pinned
-
TensorConstPtr stopWordsLens
[batchSize], pinned
-
TensorConstPtr noRepeatNgramSize
[batchSize], on gpu
-
TensorPtr cacheIndirection
[batchSize, beamWidth, maxSeqLen] - the k/v cache index for beam search, on gpu
-
std::optional<MedusaInputs> medusaInputs
-
std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs
-
std::optional<LookaheadInputs> lookaheadInputs
-
std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs
-
std::optional<EagleInputs> eagleInputs
-
struct EagleInputs
Public Functions
-
inline EagleInputs(TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths, TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths, TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds, TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)
Public Members
-
TensorConstPtr nextDraftTokens
[batchSize, maxDecodingDraftTokens]
-
TensorConstPtr nextDraftLens
[batchSize]
-
TensorConstPtr nextDraftPaths
[batchSize, maxDecodingTokens, maxPathLen]
-
TensorConstPtr lastDraftTokens
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr lastDraftLens
[batchSize]
-
TensorConstPtr lastDraftPaths
[batchSize, maxDecodingTokens, maxPathLen]
-
TensorConstPtr acceptedTokens
[batchSize, maxPathLen]
-
TensorConstPtr acceptedLens
[batchSize]
-
TensorConstPtr acceptedPathIds
[batchSize]
-
TensorConstPtr chunkedContextNextTokens
[batchSize]
-
TensorConstPtr seqSlots
[batchSize]
-
inline EagleInputs(TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths, TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths, TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds, TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)
-
class ExplicitDraftTokensInputs
Public Members
-
TensorConstPtr nextDraftTokens
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr nextFlatTokens
[batchSize * maxDecodingTokens]
-
TensorConstPtr nextDraftIndices
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr nextDraftProbs
[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]
-
TensorConstPtr lastDraftTokens
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr lastDraftIndices
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr masks
[batchSize, maxDecodingTokens, maxDecodingTokens], bool
-
TensorConstPtr packedPositionIds
[batchSize * maxDecodingTokens]
-
TensorConstPtr bestPathLengths
[batchSize]
-
TensorConstPtr bestPathIndices
[batchSize]
-
TensorConstPtr nextGenerationLengths
[batchSize]
-
TensorConstPtr lastPositionIdsBase
[batchSize]
-
TensorConstPtr lastGenerationLengths
[batchSize]
-
TensorConstPtr maxGenLengthDevice
[1]
-
TensorConstPtr seqSlots
[batchSize]
-
TensorConstPtr nextDraftTokens
-
class ExternalDraftTokensInputs
-
struct LookaheadInputs
-
class MedusaInputs
Public Members
-
TensorConstPtr medusaPaths
[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu
-
TensorConstPtr medusaTreeIds
[batchSize, maxTokensPerStep], on gpu
-
std::vector<std::vector<TensorPtr>> medusaLogits
[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu
-
TensorConstPtr medusaTargetTokensPerStep
[batchSize], on gpu
-
TensorConstPtr medusaPaths
-
inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize, TensorConstPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)
-
class DecodingInput
-
namespace runtime
decodingOutput.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace runtime
-
class DecodingOutput
-
Public Members
-
BeamHypotheses beamHypotheses
-
std::optional<SpeculativeDecodingOutputs> speculativeDecodingOutputs
-
std::optional<ExplicitDraftTokensBuffers::Inputs> explicitDraftTokensBuffers
-
std::optional<LookaheadDecodingBuffers> lookaheadOutputs
-
std::optional<EagleBuffers::Inputs> eagleBuffers
Public Static Attributes
-
static float constexpr kNegativeInfinity = -1e20f
-
class BeamHypotheses
Public Functions
-
void empty(BufferManager &manager)
-
void reshape(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)
-
void release()
-
void init(BufferManager &manager, TokenIdType endId)
-
BeamHypotheses slice(SizeType32 batchIndex, SizeType32 size) const
-
void empty(BufferManager &manager)
-
class SpeculativeDecodingOutputs
-
BeamHypotheses beamHypotheses
-
class DecodingOutput
-
namespace batch_manager
eagleBuffers.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace runtime
-
class EagleBuffers
Public Types
-
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>
-
using RequestVector = std::vector<LlmRequestPtr>
-
using SizeType32 = runtime::SizeType32
-
using TensorMap = runtime::StringPtrMap<runtime::ITensor>
Public Functions
-
EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
-
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)
-
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, runtime::ITensor const &requestTypes, ITensor const &seqSlots, EagleBuffers::Inputs const &decoderBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
-
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const
Public Members
-
class tensorrt_llm::runtime::EagleBuffers::EngineOutputs engineOutputs
Private Functions
-
template<typename T>
void setFromInputs(RequestVector const &contextRequests, RequestVector const &genRequests, SizeType32 vocabSizePadded, ITensor const &seqSlots, EagleBuffers::Inputs const &draftBuffers, runtime::EagleModule const &eagleModule, runtime::BufferManager const &manager) const
Private Members
-
std::size_t scanTempStorageBytes = {0}
-
std::size_t reduceTempStorageBytes = {0}
-
float mDefaultPosteriorThreshold = {0.09f}
-
bool mDoGreedySampling = {true}
-
class EngineOutputs
Public Members
-
class Inputs
Public Functions
-
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)
Public Members
-
TensorPtr randomDataValidation
[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
-
TensorPtr draftTokens
[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-
TensorPtr draftPaths
[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-
TensorPtr specDecodingPackedMasks
[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
-
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)
-
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>
-
class EagleBuffers
-
namespace batch_manager
explicitDraftTokensBuffers.h
-
namespace tensorrt_llm
-
namespace runtime
-
class ExplicitDraftTokensBuffers
Public Types
-
using SizeType32 = runtime::SizeType32
-
using TensorMap = runtime::StringPtrMap<runtime::ITensor>
Public Functions
-
ExplicitDraftTokensBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
-
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)
-
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const &requestTypes, ITensor const &seqSlots, ExplicitDraftTokensBuffers::Inputs const &decoderBuffers, ITensor const &contextPositionIds, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
-
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const
Public Members
-
tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs engineInputs
-
class tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs engineOutputs
-
std::size_t scanTempStorageBytes = {0}
Private Functions
-
template<typename T>
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded, ITensor const &seqSlots, ExplicitDraftTokensBuffers::Inputs const &draftBuffers, ITensor const &contextPositionIds, runtime::ExplicitDraftTokensModule const &explicitDraftTokensModule, runtime::CudaStream const &stream) const
-
class EngineInputs : public tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs
-
class EngineOutputs
Public Members
-
class Inputs
Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
Public Functions
-
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)
Public Members
-
TensorPtr randomDataValidation
[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
-
TensorPtr draftTokens
[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-
TensorPtr draftIndices
[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-
TensorPtr draftProbs
[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
-
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)
-
using SizeType32 = runtime::SizeType32
-
class ExplicitDraftTokensBuffers
-
namespace runtime
generationInput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
Public Types
-
using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
Public Functions
-
inline explicit GenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)
-
using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
-
template<typename TTensor, typename PromptTuningParams>
class GenericGenerationInput
- #include <generationInput.h>
endId, is the token ID that marks the end of the input sequence (aka EOS or end-of-sequence). It's 50,256 for the GPT2 model, which has a vocabulary of 50,257 tokens, for example.
padId, is the token ID that is used for padding (i.e. it fills in the slots that are at an index greater-or-equal to the input length for padded sequences). It can be set to the same value as endId.
ids, is the tensor of input IDs. That tensor must be allocated on the GPU. When the input tensor is padded, the shape of ids is [batchSize, maxInputLength], where batchSize and maxInputLength must respect the maximum sizes in sessionConfig passed to the GptSession constructor. When the input is packed, the shape of ids is [numTokens], where numTokens is the sum of the lengths of the different sequences in the batch.
lengths, is the tensor of input sequence lengths. That tensor must be allocated on the GPU and contain batchSize values.
packed, indicates if the ids tensor is packed or padded. In this release, that flag must match the value passed to the constructor through the instance of the ModelConfig class. In a future release, the session may be made more flexible and automatically pad or pack the input.
embeddingBiasOpt, is a tensor of floating-point values on the GPU that contains the bias to add to the logits during sampling (after the projection from hidden states to logits as the last step of the model). This tensor must have vocabSize elements (as defined in the modelConfig argument passed to the constructor).
badWordsList, is a tensor of integers on the GPU that encodes the list of words that have to be banned from generated sequences. Its shape is [2, badWordsLength], as explained below, or [batchSize, 2, badWordsLength] when there is a different list for each sequence in the batch.
stopWordsList, is a tensor of integers on the GPU that encodes the list of words that trigger the end of the generation for a sequence. Its shape is [2, stopWordsLength], as explained below, or [batchSize, 2, stopWordsLength] when there is a different list for each sequence in the batch.
maxNewTokens, is the maximum number of tokens to generate.
The badWordsList and stopWordsList tensors have the same shape [2, length]. Let's consider an example with three words to describe the representation of those lists. The first word contains tokens [5, 7, 3], the second one contains [9, 2] and the third one is composed of tokens [6, 2, 4, 1]. In total, there are 9 tokens. That's the length. The shape of the tensor is [2, 9]. The first row of the tensor must contain the 9 token IDs and the second row must store the inclusive prefix-sum of the word lengths as shown on the following diagram:

   0           3       5                   9
   |           |       |                   |
   V           V       V                   V
[  5,  7,  3,  9,  2,  6,  2,  4,  1]
[  3,  5,  9, -1, -1, -1, -1, -1, -1]

In case all the words are made of a single token, the inner-most dimension of the tensor must be increased by 1 (i.e. the length for 4 words, each made of a single token, must be 5 instead of 4; the shape is [2, 5]). A small packing sketch follows the member list below.
Public Functions
-
inline explicit GenericGenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)
Public Members
-
SizeType32 endId
-
SizeType32 padId
-
bool packed
-
std::optional<SizeType32> maxNewTokens
-
PromptTuningParams promptTuningParams
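The packing of the bad/stop words lists described above can be sketched in plain C++ (this helper is illustrative and not part of the TensorRT-LLM API):

#include <cstddef>
#include <utility>
#include <vector>

// Builds the two rows of a [2, length] bad/stop words tensor: row 0 holds the
// concatenated token ids, row 1 the inclusive prefix-sum of the word lengths,
// both padded with -1.
std::pair<std::vector<int>, std::vector<int>> packWordsList(std::vector<std::vector<int>> const& words)
{
    std::vector<int> tokens;
    std::vector<int> offsets;
    for (auto const& word : words)
    {
        tokens.insert(tokens.end(), word.begin(), word.end());
        offsets.push_back(static_cast<int>(tokens.size()));
    }
    // Per the caveat above: if every word is a single token, grow the inner dimension by one.
    std::size_t length = tokens.size();
    if (offsets.size() == length)
    {
        ++length;
    }
    tokens.resize(length, -1);
    offsets.resize(length, -1);
    return {tokens, offsets};
}

// packWordsList({{5, 7, 3}, {9, 2}, {6, 2, 4, 1}}) yields
// row 0: [5, 7, 3, 9, 2, 6, 2, 4, 1]
// row 1: [3, 5, 9, -1, -1, -1, -1, -1, -1]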
-
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
-
namespace runtime
generationOutput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>
Public Types
-
using Base = GenericGenerationOutput<ITensor::SharedPtr>
-
using Base = GenericGenerationOutput<ITensor::SharedPtr>
-
template<typename TTensor>
class GenericGenerationOutput
- #include <generationOutput.h>
ids, is a tensor that contains the output token IDs. Its shape is [batchSize, beamWidth, maxSeqLength], where maxSeqLength is the sum of maxInputLength and maxNewTokens. After generation, it contains, for each sequence, a copy of the input tokens followed by the output tokens. When a sequence is shorter than maxSeqLength, padding tokens are added at the end of the sequence.
Note that the shape of that tensor is different in this version of TensorRT-LLM from its shape in previous versions.
logProbs, is a tensor of floating-point values on the GPU to store the log-prob of the generated tokens. Its shape is [maxNewTokens, batchSize, beamWidth]. Its shape will likely change in a future release to match the shape of the output ids tensor.
contextLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the context. Its shape is [batchSize, maxSequenceLength, vocabSizePadded]. If remove_input_padding is used, its shape is [packedSize, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_context_logits or gather_all_token_logits parameter enabled.
After inference is complete, you can get the context logits in GenerationOutput.contextLogits; these are variables on the GPU. For specific acquisition methods, please refer to the example of gptSessionBenchmark.cpp. It is important to point out that enabling the computation may have an impact on performance (the language modeling head (LM head) has to perform a matrix multiplication on all the context tokens instead of just the last one).
generationLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the generation. Its shape is [batchSize, beamWidth, maxOutputLen, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_generation_logits or gather_all_token_logits parameter enabled. Generation logits can also be obtained through GenerationOutput.generationLogits after inference is completed.
onTokenGenerated, is a callback function invoked in the generation loop to pass newly generated tokens to the caller while the loop continues to execute. An implementation of that callback must accept the output ids tensor, the generation step, and a boolean flag that indicates if the generation is complete. A small sketch follows the Callback typedef below.
Public Types
-
using Callback = std::function<void(TensorPtr const &ids, SizeType32 step, bool finished)>
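A minimal sketch of a callback matching the Callback signature above (the assignment to onTokenGenerated assumes the member described in the class overview):

GenerationOutput::Callback onTokenGenerated =
    [](ITensor::SharedPtr const& ids, SizeType32 step, bool finished)
{
    // 'ids' holds the tokens produced so far ([batchSize, beamWidth, maxSeqLength]);
    // 'step' is the generation step that just completed; 'finished' marks the last call.
    if (finished)
    {
        // e.g. flush any buffered tokens to the client here
    }
};
// generationOutput.onTokenGenerated = onTokenGenerated;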
-
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>
-
namespace runtime
gptDecoder.h
-
namespace tensorrt_llm
-
namespace layers
-
namespace runtime
Functions
-
inline runtime::ITensor::SharedConstPtr getDefaultBatchSlots(runtime::SizeType32 batchSize)
Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
-
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder
Public Types
-
using CudaStreamPtr = BufferManager::CudaStreamPtr
Public Functions
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, TensorConstPtr const &batchSlots, std::optional<DecodingOutput> const &output = std::nullopt, std::optional<std::vector<decoder_batch::Request> const> const &requests = std::nullopt) override
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
-
virtual void forwardSync(DecodingOutput &output, DecodingInput const &input) override
-
inline virtual SamplingConfig const &getSamplingConfig() override
Private Members
-
std::shared_ptr<BufferManager> mManager
-
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
-
std::shared_ptr<tensorrt_llm::runtime::DecodingLayerWorkspace> mDecodingLayerWorkspace
-
SamplingConfig mSamplingConfig
-
size_t mMaxBatchSize
-
executor::DecodingMode mDecodingMode
-
using CudaStreamPtr = BufferManager::CudaStreamPtr
-
class IGptDecoder
Subclassed by tensorrt_llm::runtime::GptDecoder< T >
Public Types
-
using TensorConstPtr = runtime::ITensor::SharedConstPtr
Public Functions
-
virtual ~IGptDecoder() = default
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, TensorConstPtr const &batchSlots, std::optional<DecodingOutput> const &output = std::nullopt, std::optional<std::vector<decoder_batch::Request> const> const &requests = std::nullopt) = 0
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual void forwardSync(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual SamplingConfig const &getSamplingConfig() = 0
Public Static Functions
-
using TensorConstPtr = runtime::ITensor::SharedConstPtr
-
inline runtime::ITensor::SharedConstPtr getDefaultBatchSlots(runtime::SizeType32 batchSize)
-
namespace layers
gptDecoderBatched.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched
- #include <gptDecoderBatched.h>
GPT decoder class with support for in-flight batching.
Public Functions
-
GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const &speculativeDecodingMode, nvinfer1::DataType dtype)
-
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) override
Setup the decoder before calling forward().
-
virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override
Setup buffers for ExplicitDraftTokens decoding.
-
virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) override
Setup buffers for Eagle decoding.
-
virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override
Setup buffers for Lookahead decoding.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) override
Initialize the decoder with new batch of inputs.
-
virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs, ModelConfig const &modelConfig) override
Initialize batched decoder at seqSlots with new requests.
-
virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent) override
Wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent, decoder_batch::Output &output, decoder_batch::Input const &input) override
Call decoder forwardSync and wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() override
Wait for the last call to forwardAsync to complete.
-
inline virtual std::vector<bool> getFinished() const override
- Returns:
[batchSize], indicators of finished requests
-
inline virtual TensorPtr getFinishReasons() const override
- Returns:
[batchSize, beamWidth], FinishedState value, on gpu
-
inline virtual TensorPtr getIds(SizeType32 batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.
-
inline virtual TensorPtr getIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
-
inline virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.
-
inline virtual TensorPtr getGatheredIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu
-
virtual CudaEvent finalize(SizeType32 batchSlot, SamplingConfig const &samplingConfig, bool streaming) const override
Gather final beam search results for request batchSlot. The result will only be available after the returned event has completed.
-
virtual void finalize(SamplingConfig const &samplingConfig) const override
Gather final beam search results for all requests.
-
inline virtual TensorPtr getParentIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-
inline virtual TensorPtr getCumLogProbs() const override
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs() const override
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getAllNewTokens() const override
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
inline virtual TensorPtr getNewTokens(SizeType32 iter = 0) const override
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
-
inline virtual std::vector<SizeType32> getNbSteps() const override
- Returns:
[batchSize], the number of generation steps executed on each request
-
inline virtual TensorPtr getNbFinished() const override
- Returns:
[1], number of finished sequences, in pinned host memory
-
inline virtual TensorPtr getNextDraftTokens() const override
- Returns:
[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
-
inline virtual TensorPtr getPrevDraftTokensLengths() const override
- Returns:
[batchSize], predicted draft tokens lengths for previous step, on gpu
-
inline virtual TensorPtr getNextDraftTokensLengths() const override
- Returns:
[batchSize], predicted draft tokens lengths for next step, on gpu
-
inline virtual TensorPtr getAcceptedLengthsCumSum() const override
- Returns:
[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
-
inline virtual TensorPtr getAcceptedPackedPaths() const override
- Returns:
[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
-
inline virtual executor::DecodingMode getDecodingMode() const override
Private Types
-
using GptDecoderPtr = std::unique_ptr<IGptDecoder>
-
using DecodingInputPtr = std::unique_ptr<DecodingInput>
-
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>
Private Functions
-
CudaEvent postProcessRequest(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const
Gather final beam search results for request batchIdx.
-
void newRequest(SizeType32 batchSlot, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)
Initialize the decoder at batchSlot with a new request.
-
void allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype)
Allocate buffers for speculative decoding.
-
void setupSpeculativeDecoding(ModelConfig const &modelConfig)
Setup buffers for speculative decoding.
-
void setupLookahead(ModelConfig const &modelConfig)
Setup buffers for lookahead decoding.
-
void newRequestSpeculativeDecoding(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig)
Sets up the decoder internal tensors for a new speculative decoding request.
-
void newRequestDraftTokensExternal(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)
Sets up the decoder internal tensors for a new request in Draft model Sps mode.
-
void newRequestMedusa(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up the decoder internal tensors for a new Medusa request.
-
void newRequestLookahead(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up the decoder internal tensors for a new Lookahead request.
-
void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up the decoder internal tensors for a new Explicit draft tokens request.
-
void newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const &request, ModelConfig const &modelConfig)
Sets up the decoder internal tensors for a new Eagle request.
-
void updateFinished(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent)
Updates finished state on host for all active requests.
-
void setExplicitDraftTokensInputs(decoder_batch::Input const &input)
Sets inputs for explicit draft tokens.
-
void setEagleInputs(decoder_batch::Input const &input)
Sets inputs for eagle decoding.
-
void forwardDispatch(decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)
Calls decoders for tokens per engine step.
-
void forwardDecoder(SizeType32 step, decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)
Calls decoder for whole batch.
Private Members
-
std::size_t const mVocabSize
-
std::size_t const mVocabSizePadded
-
CudaStreamPtr mRuntimeStream
-
CudaStreamPtr mDecoderStream
-
BufferManager mBufferManager
-
DecoderFinishedEventPtr mDecoderFinishEvent
-
GptDecoderPtr mDecoder
-
DecodingInputPtr mJointDecodingInput
-
DecodingOutputPtr mJointDecodingOutput
-
std::vector<SizeType32> mNbSteps
-
std::vector<bool> mFinished
-
std::vector<SizeType32> mMaxNewTokens
-
std::vector<SizeType32> mBeamWidths
-
std::vector<SizeType32> mNumDecodingEngineTokens
-
SizeType32 mMaxSequenceLength = {}
-
SizeType32 mMaxAttentionWindow = {}
-
SizeType32 mSinkTokenLength = {}
-
SizeType32 mActualBatchSize = {}
-
SizeType32 mMaxDecodingDecoderTokens = {}
-
SizeType32 mMaxDecodingEngineTokens = {}
-
SpeculativeDecodingMode mSpeculativeDecodingMode
-
executor::DecodingMode mDecodingMode = {executor::DecodingMode::Auto()}
-
std::shared_ptr<DecodingOutput::BeamHypotheses> mOutputBeamHypotheses = {nullptr}
-
DecodingOutput::TensorPtr mCumLogProbsTmp
-
SizeType32 mNumSMs
-
GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const &speculativeDecodingMode, nvinfer1::DataType dtype)
-
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched
-
namespace runtime
gptJsonConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptJsonConfig
Public Functions
-
inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism, SizeType32 gpusPerNode, ModelConfig modelConfig, std::optional<RuntimeDefaults> runtimeDefaults = std::nullopt)
-
inline ModelConfig const &getModelConfig() const
-
inline ModelConfig &getModelConfigMutable()
-
inline std::string const &getName() const
-
inline std::string const &getVersion() const
-
inline std::string const &getPrecision() const
-
inline SizeType32 constexpr getTensorParallelism() const
-
inline SizeType32 constexpr getPipelineParallelism() const
-
inline SizeType32 constexpr getContextParallelism() const
-
inline SizeType32 constexpr getGpusPerNode() const
-
inline SizeType32 constexpr getWorldSize() const
-
inline std::optional<RuntimeDefaults> getRuntimeDefaults() const
-
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
-
inline std::string engineFilename(WorldConfig const &worldConfig) const
Public Static Functions
-
static GptJsonConfig parse(std::string const &json)
-
static GptJsonConfig parse(std::istream &json)
-
static GptJsonConfig parse(std::filesystem::path const &path)
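A minimal sketch of loading an engine configuration (the path is illustrative; assumes the tensorrt_llm::runtime namespace):

#include <filesystem>

auto const jsonConfig = GptJsonConfig::parse(std::filesystem::path{"engine_dir/config.json"});
auto const& modelConfig = jsonConfig.getModelConfig();
auto const worldSize = jsonConfig.getWorldSize(); // total number of ranks implied by the parallelism settings
// Given a WorldConfig describing the current rank (construction not shown here),
// the rank-specific engine file name is obtained with jsonConfig.engineFilename(worldConfig).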
Private Members
-
std::string const mName
-
std::string const mVersion
-
std::string const mPrecision
-
SizeType32 const mTensorParallelism
-
SizeType32 const mPipelineParallelism
-
SizeType32 const mContextParallelism
-
SizeType32 const mGpusPerNode
-
ModelConfig mModelConfig
-
std::optional<RuntimeDefaults> mRuntimeDefaults
-
inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism, SizeType32 gpusPerNode, ModelConfig modelConfig, std::optional<RuntimeDefaults> runtimeDefaults = std::nullopt)
-
class GptJsonConfig
-
namespace runtime
gptSession.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace kv_cache_manager
-
namespace kv_cache_manager
-
namespace runtime
-
class GptSession
-
Public Functions
-
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, RawEngine const &rawEngine, LoggerPtr logger = nullptr)
- Parameters:
sessionConfig – Configuration of the session,
modelConfig – Description of the model,
worldConfig – Description of the environment,
rawEngine – The compiled TensorRT engine,
logger – The optional logger.
-
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
-
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
-
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
-
BufferManager const &getBufferManager() const
-
BufferManager::CudaStreamPtr getRuntimeStreamPtr() const
-
inline ModelConfig const &getModelConfig() const
-
inline WorldConfig const &getWorldConfig() const
-
inline int getDevice() const noexcept
-
inline bool getNormalizeLogProbs() const noexcept
This function performs the generation loop.
Given input tensors to read from and output tensors to populate, the generate member function runs the generation loop until it reaches the maximum number of tokens that can be produced, or until each sequence has reached completion (due to the production of "end-of-sequence" or a word in the list of "stop words"). The pseudo-code of that function looks like this (member function names were changed to keep the presentation simple):

// Have all the sequences in the batch reached completion?
bool allFinished = false;

// Until all sequences are finished or the number of steps reaches the limit...
for (int step = 0; !allFinished && step < maxNewTokens; ++step)
{
    // Trigger the computation of the logits...
    computeLogits(...);

    // Run the sampling to produce a token (for each active sequence) from the logits.
    allFinished = generateTokensFromLogits(...);

    // Callback to stream the output tokens while the generation loop continues.
    onTokenGenerated(...);
}
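A condensed usage sketch assembled from the types documented on this page (modelConfig and worldConfig are assumed to exist, the GenerationOutput and SamplingConfig constructors shown are assumptions, and error handling is omitted; see gptSessionBenchmark.cpp in the repository for a complete example):

// Session limits: up to 8 sequences, beam width 1, 1024 tokens per sequence.
GptSession::Config sessionConfig{/*maxBatchSize=*/8, /*maxBeamWidth=*/1, /*maxSequenceLength=*/1024};
GptSession session{sessionConfig, modelConfig, worldConfig, std::string{"rank0.engine"}};

auto const& manager = session.getBufferManager();
std::vector<std::int32_t> inputIds{1, 2, 3}; // tokenized prompt (illustrative)
std::vector<std::int32_t> inputLengths{3};

GenerationInput input{/*endId=*/50256, /*padId=*/50256,
    manager.copyFrom(inputIds, ITensor::makeShape({1, 3}), MemoryType::kGPU),
    manager.copyFrom(inputLengths, ITensor::makeShape({1}), MemoryType::kGPU)};
input.maxNewTokens = 32;

GenerationOutput output{ // assumed constructor: pre-allocated ids and lengths tensors
    manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),
    manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)};

SamplingConfig samplingConfig{/*beamWidth=*/1}; // assumed constructor
session.generate(output, input, samplingConfig); // runs the loop sketched above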
-
void setLayerProfiler()
Set LayerProfiler to collect performance per layer.
-
std::string getLayerProfileInfo() const
Print profile information per layer.
Private Types
-
using BaseKVCacheManager = batch_manager::kv_cache_manager::BaseKVCacheManager
-
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
-
using TokenGeneratedCallback = std::function<void(SizeType32 step, bool finished)>
Private Functions
-
inline bool useCudaGraphs()
-
void createContexts()
-
void createBuffers(SizeType32 numMicroBatches)
-
void createDecoders(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType32 numMicroBatches, executor::DecodingMode const &decodingMode)
-
void createKvCacheManager(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, KvCacheConfig const &config)
-
void createCustomAllReduceWorkspace(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)
-
void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType32> const &generationBatchesOffsets, BaseKVCacheManager const *kvCacheManager)
-
SizeType32 executeGenerationStep(SizeType32 step, std::vector<GenerationInput> const µBatchesInputs, std::vector<GenerationOutput> µBatchesOutputs, std::vector<SizeType32> const µBatchOffsets, BaseKVCacheManager *kvCacheManager, std::vector<bool> µBatchesFinished)
-
void decoderStepAsync(SizeType32 decoderStep, SizeType32 microBatchId)
Execute decoder on last PP rank, receive decoder output on other PP ranks.
-
bool shouldStopSync(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 microBatchId)
Synchronize with the decoder and return the shouldStop flag.
-
void finalize(SizeType32 microBatchId, SamplingConfig const &samplingConfig)
Collect final output ids and log probs on last PP rank and send them to first PP rank.
Receives are asynchronous on host, so synchronization is required before access.
-
void kvCacheAddSequences(SizeType32 beamWidth, SizeType32 microBatchId, SizeType32 firstBatchIdx)
-
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType32 microBatchId) const
Populate outputIds and return reference to newTokens tensor.
-
TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)
-
bool shouldUseKVCacheManager() const
Private Members
-
ModelConfig const mModelConfig
-
WorldConfig const mWorldConfig
-
int mDevice = {-1}
-
std::shared_ptr<NcclCommunicator> mPipelineComm
-
std::shared_ptr<CudaStream> mCommStream
-
std::shared_ptr<AllReduceBuffers> mAllReduceBuffers
-
SizeType32 mDecoderMaxSequenceLength = {}
-
std::vector<SizeType32> mDecoderMaxAttentionWindowVec = {}
-
SizeType32 mDecoderMaxAttentionWindow = {}
-
SizeType32 mDecoderSinkTokenLength = {}
-
std::shared_ptr<TllmRuntime> mRuntime
-
std::shared_ptr<BaseKVCacheManager> mKvCacheManager
-
MicroBatchConfig mMicroBatchConfig
-
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
-
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
-
bool mCudaGraphMode = {false}
-
std::vector<CudaGraphExecutor> mCudaGraphInstances
-
bool mNormalizeLogProbs = true
Friends
- friend class batch_manager::TrtGptModelV1
-
class Config
- #include <gptSession.h>
Configuration for session execution and buffer sizes.
generate may be called with batch size and beam width smaller than the configured parameters. maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.
Public Functions
-
inline Config(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, float gpuWeightsPercent = 1.0)
Public Members
-
SizeType32 maxBatchSize
-
SizeType32 maxBeamWidth
-
SizeType32 maxSequenceLength
-
float gpuWeightsPercent
-
bool decoderPerRequest = {false}
-
bool cudaGraphMode = {false}
-
KvCacheConfig kvCacheConfig = {}
-
std::optional<SizeType32> ctxMicroBatchSize = std::nullopt
-
std::optional<SizeType32> genMicroBatchSize = std::nullopt
-
std::optional<executor::DecodingMode> decodingMode = std::nullopt
-
bool normalizeLogProbs = true
-
inline Config(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, float gpuWeightsPercent = 1.0)
-
class CudaGraphExecutor
Public Functions
-
CudaGraphExecutor() = default
-
inline ~CudaGraphExecutor()
-
inline bool hasInstance()
-
void clear()
-
void prepareNextGraph(TllmRuntime const &runtime, SizeType32 nextContextId)
-
void launch(CudaStream const &stream)
Private Functions
-
void create(cudaGraph_t const &graph)
-
bool update(cudaGraph_t const &graph)
-
void uploadToStream(CudaStream const &stream)
Private Members
-
cudaGraphExec_t mInstance
-
CudaGraphExecutor() = default
-
class GenerationProfiler
- #include <gptSession.h>
Optional profiler class to profile the generation phase of an inference request.
Public Static Attributes
-
static constexpr unsigned int flags = {cudaEventDefault}
-
static constexpr unsigned int flags = {cudaEventDefault}
-
class MicroBatchConfig
Public Functions
-
inline MicroBatchConfig()
-
explicit MicroBatchConfig(SizeType32 maxBatchSize, SizeType32 pipelineParallelism, std::optional<SizeType32> genMicroBatchSize, std::optional<SizeType32> ctxMicroBatchSize)
-
inline constexpr SizeType32 numCtxPerGen() const
-
inline constexpr SizeType32 getGenGraphId(SizeType32 flipFlopId, SizeType32 generationBatchId) const
flip-flop between 2 graph instances for each generation batch.
Public Members
-
SizeType32 numCtxBatches
-
SizeType32 numGenBatches
-
SizeType32 ctxBatchSize
-
SizeType32 genBatchSize
-
inline MicroBatchConfig()
-
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, RawEngine const &rawEngine, LoggerPtr logger = nullptr)
-
class GptSession
-
namespace batch_manager
iBuffer.h
-
template<>
struct MemoryTypeString<MemoryType::kGPU>
Public Static Attributes
-
static auto constexpr value = "GPU"
-
static auto constexpr value = "GPU"
-
template<>
struct MemoryTypeString<MemoryType::kCPU>
Public Static Attributes
-
static auto constexpr value = "CPU"
-
static auto constexpr value = "CPU"
-
template<>
struct MemoryTypeString<MemoryType::kPINNED>
Public Static Attributes
-
static auto constexpr value = "PINNED"
-
static auto constexpr value = "PINNED"
-
template<>
struct MemoryTypeString<MemoryType::kUVM>
Public Static Attributes
-
static auto constexpr value = "UVM"
-
static auto constexpr value = "UVM"
-
template<>
struct MemoryTypeString<MemoryType::kPINNEDPOOL>
Public Static Attributes
-
static auto constexpr value = "PINNEDPOOL"
-
static auto constexpr value = "PINNEDPOOL"
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>
Public Types
-
using type = std::int32_t
-
using type = std::int32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>
Public Types
-
using type = std::int64_t
-
using type = std::int64_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>
Public Types
-
using type = std::uint32_t
-
using type = std::uint32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>
Public Types
-
using type = std::uint64_t
-
using type = std::uint64_t
-
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>
Public Types
-
using type = bool
-
using type = bool
-
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>
Public Types
-
using type = std::uint8_t
-
using type = std::uint8_t
-
template<>
struct TRTDataType<std::int8_t>
-
template<>
struct TRTDataType<std::int32_t>
-
template<>
struct TRTDataType<std::uint32_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
template<>
struct TRTDataType<std::int64_t>
-
template<>
struct TRTDataType<std::uint64_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
template<>
struct TRTDataType<std::uint8_t>
-
template<>
struct TRTDataType<kernels::KVCacheIndex>
Public Static Attributes
-
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
-
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
-
template<>
struct TRTDataType<kernels::FinishedState>
Public Static Attributes
-
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value
-
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value
-
template<>
struct TRTDataType<runtime::RequestType>
Public Static Attributes
-
static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value
-
static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value
-
namespace tensorrt_llm
-
namespace runtime
Typedefs
Enums
Functions
-
template<typename T>
T const *bufferCast(IBuffer const &buffer)
Gets a typed pointer to the constant underlying data of the buffer.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
buffer – The buffer to get a pointer to.
- Returns:
A pointer to constant T.
-
template<typename T>
T *bufferCast(IBuffer &buffer)
Gets a typed pointer to the underlying data of the buffer.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
buffer – The buffer to get a pointer to.
- Returns:
A pointer to T.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
bufferPtr – A possibly null shared ptr.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
bufferPtr – A possibly null shared ptr.
- Returns:
A pointer to const T, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to const T, possibly nullptr.
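A minimal sketch of the typed accessors above (assuming the tensorrt_llm::runtime namespace):

#include <numeric>

// Allocate 8 int32 elements on the host and fill them through a typed pointer.
auto buffer = BufferManager::cpu(8, nvinfer1::DataType::kINT32);
auto* data = bufferCast<std::int32_t>(*buffer);
std::iota(data, data + buffer->getSize(), 0); // 0, 1, ..., 7

// The const overload yields a read-only pointer.
IBuffer const& constRef = *buffer;
auto const* readOnly = bufferCast<std::int32_t>(constRef);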
-
class BufferDataType
- #include <iBuffer.h>
A wrapper around nvinfer1::DataType that provides support for pointer types.
-
template<typename T>
class BufferRange : public tensorrt_llm::common::ArrayView<T>
Public Types
-
using Base = tensorrt_llm::common::ArrayView<T>
-
using Base = tensorrt_llm::common::ArrayView<T>
-
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits
- #include <iBuffer.h>
For converting a TensorRT data type to a C++ data type.
-
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>
-
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>
Public Types
-
using type = bool
-
using type = bool
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>
Public Types
-
using type = float
Public Static Attributes
-
static char constexpr name[] = "float"
-
static auto constexpr size = sizeof(type)
-
using type = float
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>
Public Types
-
using type = half
Public Static Attributes
-
static char constexpr name[] = "half"
-
static auto constexpr size = sizeof(type)
-
using type = half
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>
Public Types
-
using type = std::int32_t
Public Static Attributes
-
static char constexpr name[] = "int32"
-
static auto constexpr size = sizeof(type)
-
using type = std::int32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>
Public Types
-
using type = std::uint32_t
Public Static Attributes
-
static char constexpr name[] = "uint32"
-
static auto constexpr size = sizeof(type)
-
using type = std::uint32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>
Public Types
-
using type = std::int64_t
Public Static Attributes
-
static char constexpr name[] = "int64"
-
static auto constexpr size = sizeof(type)
-
using type = std::int64_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>
Public Types
-
using type = std::uint64_t
Public Static Attributes
-
static char constexpr name[] = "uint64"
-
static auto constexpr size = sizeof(type)
-
using type = std::uint64_t
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT8>
Public Types
-
using type = std::int8_t
Public Static Attributes
-
static char constexpr name[] = "int8"
-
static auto constexpr size = sizeof(type)
-
using type = std::int8_t
- template<bool kUnsigned> struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>
Public Types
-
using type = std::uint8_t
Public Static Attributes
-
static char constexpr name[] = "uint8"
-
static auto constexpr size = sizeof(type)
-
using type = std::uint8_t
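As a compile-time sketch of the mapping above (assuming the <iBuffer.h> include path listed in this document):

```cpp
#include <iBuffer.h>  // assumed include path

#include <cstdint>
#include <type_traits>

using namespace tensorrt_llm::runtime;

// TensorRT data type -> C++ type, resolved at compile time.
static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kFLOAT>::type, float>);
static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kINT32>::type, std::int32_t>);
static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kINT32, true>::type, std::uint32_t>);

// Each specialization also carries a printable name and the element size in bytes.
static_assert(DataTypeTraits<nvinfer1::DataType::kINT32>::size == sizeof(std::int32_t));
```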
-
class IBuffer
Subclassed by tensorrt_llm::runtime::ITensor
Public Types
Public Functions
-
virtual void *data() = 0
Returns a pointer to underlying array.
-
virtual void const *data() const = 0
Returns a pointer to underlying array.
-
inline virtual void *data(std::size_t index)
Returns a pointer to the underlying array at a given element index.
-
inline virtual void const *data(std::size_t index) const
Returns a pointer to the underlying array at a given element index.
-
virtual std::size_t getSize() const = 0
Returns the size (in number of elements) of the buffer.
-
inline virtual std::size_t getSizeInBytes() const
Returns the size (in bytes) of the buffer.
-
virtual std::size_t getCapacity() const = 0
Returns the capacity of the buffer.
-
virtual char const *getDataTypeName() const
-
virtual MemoryType getMemoryType() const = 0
Returns the memory type of the buffer.
-
virtual char const *getMemoryTypeName() const
-
virtual void resize(std::size_t newSize) = 0
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
virtual void release() = 0
Releases the buffer. It will be reset to nullptr.
-
virtual ~IBuffer() = default
Public Static Functions
Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
- Parameters:
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
- Returns:
A view on the buffer.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying tensor which can be independently resized.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
Returns a view on the underlying tensor with a different size.
- Parameters:
tensor – The tensor to view.
size – The size of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
-
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
- Returns:
An IBuffer.
-
static MemoryType memoryType(void const *data)
Determine the memory type of a pointer.
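A short sketch combining wrap, the size queries, and memoryType; the include path and variable names are illustrative only.

```cpp
#include <iBuffer.h>  // assumed include path

#include <cassert>
#include <cstdint>
#include <vector>

using namespace tensorrt_llm::runtime;

void wrapAndInspect()
{
    // Wrap caller-owned memory; the buffer cannot be resized beyond its capacity.
    std::vector<std::int32_t> ids(16);
    auto buffer = IBuffer::wrap(ids.data(), nvinfer1::DataType::kINT32, ids.size(), ids.size());

    // getSize() reports elements, getSizeInBytes() accounts for the data type.
    assert(buffer->getSize() == 16);
    assert(buffer->getSizeInBytes() == 16 * sizeof(std::int32_t));

    // The memory type of a raw pointer can be queried directly.
    MemoryType where = IBuffer::memoryType(ids.data());  // kCPU for ordinary host memory
    (void) where;
}
```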
-
virtual void *data() = 0
-
template<MemoryType T>
struct MemoryTypeString
- template<> struct MemoryTypeString<MemoryType::kCPU>
Public Static Attributes
-
static auto constexpr value = "CPU"
-
static auto constexpr value = "CPU"
- template<> struct MemoryTypeString<MemoryType::kGPU>
Public Static Attributes
-
static auto constexpr value = "GPU"
-
static auto constexpr value = "GPU"
- template<> struct MemoryTypeString<MemoryType::kPINNED>
Public Static Attributes
-
static auto constexpr value = "PINNED"
-
static auto constexpr value = "PINNED"
- template<> struct MemoryTypeString<MemoryType::kPINNEDPOOL>
Public Static Attributes
-
static auto constexpr value = "PINNEDPOOL"
-
static auto constexpr value = "PINNEDPOOL"
- template<> struct MemoryTypeString<MemoryType::kUVM>
Public Static Attributes
-
static auto constexpr value = "UVM"
-
static auto constexpr value = "UVM"
-
template<typename T, bool = false>
struct TRTDataType
- #include <iBuffer.h>
For converting a C++ data type to a TensorRT data type.
-
template<>
struct TRTDataType<bool>
-
template<>
struct TRTDataType<float>
-
template<>
struct TRTDataType<half>
- template<> struct TRTDataType<kernels::FinishedState>
Public Static Attributes
-
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value
-
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value
- template<> struct TRTDataType<kernels::KVCacheIndex>
Public Static Attributes
-
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
-
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
- template<> struct TRTDataType<runtime::RequestType>
Public Static Attributes
-
static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value
-
static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value
- template<> struct TRTDataType<std::int32_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT32
-
static constexpr auto value = nvinfer1::DataType::kINT32
- template<> struct TRTDataType<std::int64_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT64
-
static constexpr auto value = nvinfer1::DataType::kINT64
- template<> struct TRTDataType<std::int8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT8
-
static constexpr auto value = nvinfer1::DataType::kINT8
- template<> struct TRTDataType<std::uint32_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
- template<> struct TRTDataType<std::uint64_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
- template<> struct TRTDataType<std::uint8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kUINT8
-
static constexpr auto value = nvinfer1::DataType::kUINT8
-
template<typename T>
struct TRTDataType<T*>
Public Static Attributes
-
static auto constexpr value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
Private Static Attributes
-
static auto constexpr kUnderlyingType = BufferDataType{TRTDataType<std::remove_const_t<T>, false>::value}
-
static auto constexpr value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
-
template<>
struct TRTDataType<void*>
Public Static Attributes
-
static constexpr auto value = BufferDataType::kTrtPointerType
-
static constexpr auto value = BufferDataType::kTrtPointerType
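A compile-time sketch of the forward mapping; only the float and int specializations are asserted here, the unsigned and pointer cases are described in comments because they resolve to BufferDataType values rather than plain nvinfer1::DataType values.

```cpp
#include <iBuffer.h>  // assumed include path

#include <cstdint>

using namespace tensorrt_llm::runtime;

// C++ type -> TensorRT data type, resolved at compile time.
static_assert(TRTDataType<float>::value == nvinfer1::DataType::kFLOAT);
static_assert(TRTDataType<std::int32_t>::value == nvinfer1::DataType::kINT32);

// Unsigned and pointer types map to BufferDataType values instead:
// TRTDataType<std::uint32_t>::value carries kINT32 plus an "unsigned" flag, and
// TRTDataType<T*>::value additionally marks the type as a pointer.
```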
-
template<typename T>
-
namespace runtime
iGptDecoderBatched.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IGptDecoderBatched : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
- #include <iGptDecoderBatched.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::GptDecoderBatched
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
using DecoderFinishedEventPtr = std::unique_ptr<decoder_batch::DecoderFinishedEvent const>
Public Functions
-
virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) = 0
Setup buffers for ExplicitDraftTokens decoding.
-
virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) = 0
Setup buffers for Eagle decoding.
-
virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) = 0
Setup buffers for Lookahead decoding.
-
virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &token, decoder_batch::Output &output, decoder_batch::Input const &input) = 0
Call decoder forwardSync and wait for the call to
forwardAsync
associated with a token to complete.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &token) = 0
Wait for the call to
forwardAsync
associated with a token to complete.
-
inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)
Run one step for all requests and wait for completion on the host.
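A rough usage sketch based only on the signatures above; the decoder, output, and input objects are assumed to be prepared elsewhere, and error handling is omitted.

```cpp
#include <iGptDecoderBatched.h>  // assumed include path

using namespace tensorrt_llm::runtime;

// Run one decoding step asynchronously and synchronize later.
void stepDecoder(IGptDecoderBatched& decoder, decoder_batch::Output& output, decoder_batch::Input const& input)
{
    // Launch the step for all active requests; the returned event is used for synchronization.
    auto finishedEvent = decoder.forwardAsync(output, input);

    // ... overlap other host-side work here ...

    // Block until the asynchronous step associated with the event has completed.
    decoder.forwardSync(*finishedEvent);
}
```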
-
virtual TensorPtr getIds(SizeType32 batchIdx) const = 0
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request
batchIdx
, on gpu
-
virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const = 0
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search in GptDecoderBatched. It contains gathered token ids without padding, on gpu
-
virtual CudaEvent finalize(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const = 0
Gather final beam search results for request batchIdx. The result will only be available after the returned event has completed.
-
virtual std::vector<bool> getFinished() const = 0
- Returns:
[batchSize (actual)], marks finished requests (per batch)
-
virtual TensorPtr getFinishReasons() const = 0
- Returns:
[batchSize, beamWidth], FinishedState value, on gpu
-
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, beamWidth], cumulative log probabilities (per beam), on gpu
-
virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const = 0
- Returns:
[beamWidth], cumulative log probabilities (per beam) for request batchIdx, on gpu
-
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, beamWidth, maxSeqLen], log probabilities (per beam), on gpu
-
virtual TensorPtr getLogProbs(SizeType32 batchIdx) const = 0
- Returns:
[beamWidth, maxSeqLen], cumulative log probabilities (per beam) for request batchIdx, on gpu
-
virtual std::vector<SizeType32> getNbSteps() const = 0
-
virtual executor::DecodingMode getDecodingMode() const = 0
-
virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs, ModelConfig const &modelConfig) = 0
Initialize the batched decoder at seqSlots with new requests.
-
virtual TensorPtr getNextDraftTokens() const = 0
- Returns:
[batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu
-
virtual TensorPtr getPrevDraftTokensLengths() const = 0
- Returns:
[batchSize], predicted draft tokens lengths for previous step, on gpu
-
virtual TensorPtr getNextDraftTokensLengths() const = 0
- Returns:
[batchSize], predicted draft tokens lengths for next step, on gpu
Protected Functions
-
IGptDecoderBatched() = default
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
namespace decoder_batch
-
-
class DecoderFinishedEvent
Public Functions
-
class Input
-
Public Functions
Public Members
-
std::vector<bool> active
-
std::optional<ExplicitDraftTokensBuffers::EngineOutputs> explicitDraftTokensInputs
-
std::optional<ExplicitDraftTokensBuffers::EngineInputs> explicitDraftTokensLastInputs
-
std::optional<EagleBuffers::EngineOutputs> eagleInputs
-
std::optional<EagleBuffers::Inputs> eagleLastInputs
-
std::vector<bool> active
-
class DecoderFinishedEvent
-
class IGptDecoderBatched : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
-
namespace runtime
iStatefulGptDecoder.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace runtime
-
class IStatefulGptDecoder
- #include <iStatefulGptDecoder.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::IGptDecoderBatched
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) = 0
Setup the decoder before calling
forward()
, also calls reshapeBuffers.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, ModelConfig const &modelConfig) = 0
Initialize the decoder with new batch of inputs.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() = 0
Wait for the last call to
forwardAsync
to complete.
-
inline virtual void forward(decoder::Output &output, decoder::Input const &input)
Run one step for all requests.
-
virtual void finalize(SamplingConfig const &samplingConfig) const = 0
Gather final beam search results for all requests.
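One possible decoding loop built from the functions above; a sketch only, assuming all referenced objects (inputs, outputs, configs, decoder::Input/Output) are prepared by the caller.

```cpp
#include <iStatefulGptDecoder.h>  // assumed include path

using namespace tensorrt_llm::runtime;

void decodeBatch(IStatefulGptDecoder& decoder, GenerationInput const& inputs, GenerationOutput const& outputs,
    SamplingConfig const& samplingConfig, ModelConfig const& modelConfig, decoder::Output& output,
    decoder::Input const& input, SizeType32 batchSize)
{
    // Initialize decoder state for a new batch of requests.
    decoder.newBatch(inputs, outputs, samplingConfig, modelConfig);

    SizeType32 nbFinished = 0;
    while (nbFinished < batchSize)
    {
        // forward() is forwardAsync() followed by forwardSync().
        decoder.forward(output, input);

        // getNbFinished() returns a [1] tensor in pinned host memory.
        nbFinished = *bufferCast<SizeType32>(*decoder.getNbFinished());
    }

    // Gather final beam search results for all requests.
    decoder.finalize(samplingConfig);
}
```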
-
virtual TensorPtr getIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
-
virtual TensorPtr getGatheredIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength] token ids after gatherTree
-
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
virtual TensorPtr getNewTokens(SizeType32 iter = 0) const = 0
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in
iter
(per beam), on gpu
-
virtual TensorPtr getAllNewTokens() const = 0
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
virtual TensorPtr getNbFinished() const = 0
- Returns:
[1], number of finished sequences, in pinned host memory
-
virtual ~IStatefulGptDecoder() = default
Protected Functions
-
IStatefulGptDecoder() = default
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
class IStatefulGptDecoder
-
namespace batch_manager
iTensor.h
-
namespace nvinfer1
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)
Utility function to print a shape.
-
std::ostream &operator<<(std::ostream &output, ITensor const &tensor)
Utility function to print a tensor with its shape.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
tensorPtr – A possibly null shared ptr.
- Returns:
A pointer to T const, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
tensorPtr – A possibly null shared ptr.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to const T, possibly nullptr.
-
class ITensor : public virtual tensorrt_llm::runtime::IBuffer
-
Public Functions
-
~ITensor() override = default
-
template<SizeType32 n>
inline DimType64 getDimension() const
Returns the tensor's n-th dimension. If n is negative, returns the (nbDims - n)th dimension. TODO: replace with constexpr parameter when moving to C++20.
-
virtual void reshape(Shape const &dims) = 0
Sets the tensor dimensions. The new size of the tensor will be
volume(dims)
-
inline virtual void resize(std::size_t newSize) override
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
inline void squeeze(SizeType32 dim)
Removes the given unit dimensions from this tensor.
-
inline void unsqueeze(SizeType32 dim)
Adds a unit dimension at the specified position.
-
inline bool shapeEquals(std::initializer_list<SizeType32> const &other) const
-
template<typename T>
inline bool shapeEquals(T const *dims, SizeType32 count) const
Public Static Functions
-
static inline std::int64_t volume(Shape const &dims)
Returns the volume of the dimensions. Returns -1 if
d.nbDims < 0
.
-
static inline std::size_t volumeNonNegative(Shape const &shape)
Returns the volume of the dimensions. Throws if
d.nbDims < 0
.
-
static Shape squeeze(Shape const &shape, SizeType32 dim)
Removes the given unit dimension from shape.
- Parameters:
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
- Returns:
A new shape without the unit dimension.
-
static Shape unsqueeze(Shape const &shape, SizeType32 dim)
Add a unit dimension to shape at the specified position.
- Parameters:
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
- Returns:
A new shape with the added unit dimension.
Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
- Parameters:
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
- Returns:
A view on the
buffer
.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
- Parameters:
offsetDims – The offset in multiple dimensions.
tensor – The tensor to view.
offsetDims – The offset dimensions of the view.
size – The size of the view w.r.t. the last dimension in offsetDims.
offsetDims – specifies all dimensions.
- Throws:
Whenever – offset overflows or the last dimension offset+size overflows.
- Returns:
A view of shape [size, the rest dimensions] or [size] when offsetDims specifies all dimensions.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims, std::size_t size)
Returns the remaining slices at the last dimension when size is omitted.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
- Parameters:
offsetDims – specifies all dimensions.
- Returns:
Just the block at the point, with shape of [the rest dimensions] or [1] when offsetDims specifies all dimensions.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr at(TConstPtr &&tensor, Shape const &offsetDims)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline ITensor::UniqueConstPtr at(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
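A sketch of slicing, assuming <iTensor.h> as the include path, an ITensor::SharedPtr alias, and non-const counterparts of the slice/at overloads shown above; data ownership stays with the caller.

```cpp
#include <iTensor.h>  // assumed include path

#include <vector>

using namespace tensorrt_llm::runtime;

void sliceExample()
{
    // Wrap caller-owned memory as a [4, 8] float tensor (the tensor does not own the data).
    std::vector<float> storage(4 * 8);
    ITensor::SharedPtr tensor = ITensor::wrap(storage.data(), ITensor::makeShape({4, 8}), storage.size());

    // Rows 1..2 as a [2, 8] view sharing the same storage.
    auto rows = ITensor::slice(tensor, /*offset=*/1, /*size=*/2);

    // The single block at row 3, i.e. a view of shape [8].
    auto row = ITensor::at(tensor, {3});

    (void) rows;
    (void) row;
}
```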
Returns a view on the underlying buffer (or tensor) with the given shape.
- Parameters:
tensor – The tensor to view.
shape – The shape of the view.
- Returns:
A view on the
tensor
.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
Returns a view on the underlying tensor which can be independently reshaped.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the
tensor
.
Returns a flattened view on the underlying tensor which can be independently reshaped.
- Parameters:
tensor – The tensor to flatten.
sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.
- Returns:
A flattened view on the tensor.
-
static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
- Returns:
An ITensor.
-
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
-
static Shape makeShape(std::initializer_list<DimType64> const &dims)
A convenience function to create a tensor shape with the given dimensions.
-
static std::string toString(Shape const &dims)
A convenience function for converting a tensor shape to a
string
.
-
static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)
A convenience function to compare shapes.
-
template<typename T>
static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType32 count)
A convenience function to compare shapes.
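A small sketch of the shape helpers above; the values in the comments are what the calls should produce for this example.

```cpp
#include <iTensor.h>  // assumed include path

#include <string>

using namespace tensorrt_llm::runtime;

void shapeUtilities()
{
    auto shape = ITensor::makeShape({4, 8});        // [4, 8]
    auto numElements = ITensor::volume(shape);      // 32

    // Add and remove unit dimensions without touching any data.
    auto expanded = ITensor::unsqueeze(shape, 0);   // [1, 4, 8]
    auto restored = ITensor::squeeze(expanded, 0);  // [4, 8]

    bool same = ITensor::shapeEquals(shape, restored);  // true
    std::string text = ITensor::toString(restored);     // printable form, e.g. for logging

    (void) numElements;
    (void) same;
    (void) text;
}
```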
Protected Functions
-
ITensor() = default
Friends
- friend class ITensorBindings
-
~ITensor() override = default
-
inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)
-
namespace runtime
ipcUtils.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
void lamportInitializeAll(void *buffer_0, void *buffer_1, void *buffer_2, size_t size)
-
class AllReduceBuffers
-
Public Functions
-
AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, SizeType32 hiddenSize, BufferManager const &manager, WorldConfig const &worldConfig, bool const fakeBuffers = false)
-
AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, SizeType32 hiddenSize, BufferManager const &manager, WorldConfig const &worldConfig, bool const fakeBuffers = false)
-
class IpcMemory
-
Public Functions
-
IpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig, bool openIpc = true)
-
~IpcMemory()
-
inline std::vector<void*> const &getCommPtrs() const
Public Static Attributes
-
static size_t constexpr FLAGS_SIZE = (tensorrt_llm::kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)
Private Functions
-
void allocateIpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig)
-
void destroyIpcMemory()
-
IpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig, bool openIpc = true)
-
void lamportInitializeAll(void *buffer_0, void *buffer_1, void *buffer_2, size_t size)
-
namespace runtime
lookaheadBuffers.h
-
namespace tensorrt_llm
-
namespace runtime
-
class LookaheadDecodingBuffers
Public Types
-
using SizeType32 = runtime::SizeType32
-
using ITensor = tensorrt_llm::runtime::ITensor
Public Functions
-
LookaheadDecodingBuffers(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::BufferManager const &bufferManager)
-
using SizeType32 = runtime::SizeType32
-
class LookaheadRuntimeBuffers
Public Types
-
using SizeType32 = tensorrt_llm::runtime::SizeType32
-
using ITensor = tensorrt_llm::runtime::ITensor
-
using TensorMap = runtime::StringPtrMap<runtime::ITensor>
Public Functions
-
LookaheadRuntimeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
-
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const &requestTypes, ITensor const &seqSlots, LookaheadDecodingBuffers const &decoderLookaheadBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
-
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 tokensPerStep)
-
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const
Public Members
-
using SizeType32 = tensorrt_llm::runtime::SizeType32
-
class LookaheadDecodingBuffers
-
namespace runtime
lookaheadModule.h
-
namespace tensorrt_llm
-
namespace runtime
-
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
Public Functions
-
inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
-
inline explicit LookaheadModule() noexcept
-
inline void setExecutionConfig(executor::LookaheadDecodingConfig const &config)
-
inline executor::LookaheadDecodingConfig const getExecutionConfig() const
Private Members
-
executor::LookaheadDecodingConfig mExecutionConfig
-
inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
-
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
-
namespace runtime
loraCache.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
std::string to_string(LoraCache::TaskLayerModuleConfig const &v)
-
std::ostream &operator<<(std::ostream &os, LoraCache::TaskLayerModuleConfig const &v)
-
class LoraCache
- #include <loraCache.h>
Caches LoRA weights with LRU eviction policy.
Tasks put in the cache are marked in progress and can not be evicted, until they are marked done.
A cache page holds an optimally sized LoRA. A page is of size [numSlots x pageWidth]. An optimally sized LoRA is one that has the configured optimalAdapterSize.
Conceptually, a slot corresponds to an r=1, 1-layer, 1-module set of in/out weights. The page width is set to the number of weights in the smallest module.
The number of slots per page is then ceilDiv(number of weights in an optimally sized LoRA, number of weights in the smallest module).
Cache pages are allocated on one or more blocks.
Public Types
-
using TaskIdType = std::uint64_t
-
using TaskLayerModuleConfigListPtr = std::shared_ptr<std::vector<TaskLayerModuleConfig>>
Public Functions
-
LoraCache(LoraCachePageManagerConfig const &pageManagerConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, BufferManager const &bufferManager)
- Parameters:
pageManagerConfig – [in] a LoraCachePageManagerConfig
modelConfig – [in] a ModelConfig
worldConfig – [in] a WorldConfig
bufferManager – [in] a BufferManager, only used to allocate page blocks
-
void put(TaskIdType taskId, TensorPtr weights, TensorPtr config, bool load = true)
Put a task in the cache, claim pages for it, and optionally load the task weights.
- Parameters:
taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
load – [in] if true load weights before returning, otherwise do not
-
void loadWeights(TaskIdType taskId, TensorPtr weights, TensorPtr config)
Load task weights. This method must be called after put. It is designed to be called asynchronously after put returns with load = false.
- Parameters:
taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
-
inline bool isLoaded(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— true if task is loaded (weights are in place) and false otherwise
-
bool isDone(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— true if task is marked done and can be evicted
-
inline bool has(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— true if task is in the cache (not necessarily loaded) and false otherwise
-
std::vector<TaskLayerModuleConfig> const &get(TaskIdType taskId)
- Parameters:
taskId – [in] the task id
- Returns:
— list of Value objects with pointers to task weights
-
void bump(TaskIdType taskId)
bump task and make it the most recently used
- Parameters:
taskId – [in] the task id
-
void markTaskDone(TaskIdType taskId)
mark task done meaning it can be evicted
- Parameters:
taskId – [in] the task id
-
void markAllDone()
mark all tasks in cache done
-
SizeType32 determineNumPages(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— number of pages needed to store the given task
-
SizeType32 determineNumPages(TensorPtr config) const
- Parameters:
config – [in] lora config tensor
- Returns:
— number of pages needed to store the task configured with config tensor
-
bool fits(TensorPtr config) const
- Parameters:
config – [in] a lora config tensor
- Returns:
— true if the task fits in the cache, false otherwise
-
void copyTask(TaskIdType taskId, LoraCache &deviceCache, bool markDone = false)
copy task to another cache. Caches must have the same page size.
- Parameters:
taskId – [in] the task id to copy
deviceCache – [in] the LoraCache to copy the task to
markDone – [in] mark the copied task done as it’s copied
-
SizeType32 getNumPages() const
- Returns:
— total number of pages allocated to cache (used or not)
-
ITensor::SharedConstPtr getPagePtr(size_t pageId) const
- Parameters:
pageId – [in] the page id
- Returns:
— const pointer to page
Public Static Functions
-
static std::vector<LoraCache::TaskLayerModuleConfig> copyToPages(TensorPtr weights, TensorPtr config, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::unordered_map<SizeType32, LoraModule> moduleIdToModel, BufferManager const &manager, std::vector<TensorPtr> const &pages, std::vector<std::size_t> const &pageIds)
Copy task weights to cache pages.
- Parameters:
weights – [in] task weights
config – [in] task config tensor
modelConfig – [in] a ModelConfig
worldConfig – [in] a WorldConfig
moduleIdToModel – [in] map from lora module id to LoraModule
manager – [in] a BufferManager the manager to use to perform the copies
pages – [out] list of page tensors to copy weights to
pageIds – [in] page ids for the pages
- Returns:
— list of cache Value objects
-
static void splitTransposeCpu(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)
Splits the second dimension of input into tpSize parts and writes the tpRank-th split to output.
- Parameters:
output – [out] output tensor
input – [in] input tensor
tpSize – [in] number of splits
tpRank – [in] the split to write to output
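An illustrative flow through the cache API above; a sketch only, where the caches, task id, and weight/config tensors are assumed to exist and TensorPtr is assumed to be ITensor::SharedPtr.

```cpp
#include <loraCache.h>  // assumed include path

using namespace tensorrt_llm::runtime;

void cacheTask(LoraCache& hostCache, LoraCache& deviceCache, LoraCache::TaskIdType taskId,
    ITensor::SharedPtr weights, ITensor::SharedPtr config)
{
    // Skip tasks that cannot fit even after evicting everything evictable.
    if (!hostCache.fits(config))
    {
        return;
    }

    // Claim pages now and defer the weight copy (put with load = false), then load the weights.
    hostCache.put(taskId, weights, config, /*load=*/false);
    hostCache.loadWeights(taskId, weights, config);

    // Copy the task to the device cache and allow the host copy to be evicted.
    hostCache.copyTask(taskId, deviceCache, /*markDone=*/true);

    // Per-layer/module pointers into the cache pages, used to populate runtime tensors.
    auto const& layerConfigs = hostCache.get(taskId);
    (void) layerConfigs;
}
```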
Private Types
Private Functions
-
void bumpTaskInProgress(TaskIdType taskId)
-
ValueStatus getStatus(TaskIdType taskId) const
-
std::vector<std::size_t> claimPagesWithEvict(SizeType32 numPages)
claim numPages, evicting tasks if needed
- Parameters:
numPages – [in] number of pages to claim
- Throws:
std::runtime_error – if all pages cannot be claimed
- Returns:
— list of page ids
-
std::map<size_t, std::pair<size_t, SizeType32>> copyTaskMapPages(TaskValue &targetTaskValue, TaskValue const &sourceTaskValue, std::vector<size_t> const &targetPageIds, LoraCache const &targetCache)
Internal helper method used inside copyTask. Not thread safe on its own.
Private Members
-
LoraCachePageManagerConfig mPageManagerConfig
-
ModelConfig mModelConfig
-
WorldConfig mWorldConfig
-
mutable std::mutex mPagesMutex
-
std::unique_ptr<LoraCachePageManager> mCachePageManager
-
mutable std::mutex mCacheMutex
-
std::unordered_map<TaskIdType, TaskValuePtr> mCacheMap
-
std::list<TaskIdType> mInProgressTasks
-
std::list<TaskIdType> mDoneTasks
-
std::vector<std::unique_ptr<BufferManager>> mDeviceBufferManagers
-
std::unique_ptr<BufferManager> mBufferManager
-
std::unordered_map<SizeType32, LoraModule> mModuleIdToModule
Private Static Functions
-
template<typename T>
static void splitTransposeCpuInner(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)
-
struct TaskLayerModuleConfig
- #include <loraCache.h>
Contains information on a single layer / module. A list of these configs is associated with each task and can be used to populate runtime tensors.
Public Functions
-
std::string toString() const
-
bool operator==(LoraCache::TaskLayerModuleConfig const &o) const
Public Members
-
std::size_t pageId
-
SizeType32 slotIdx
-
SizeType32 inSize
-
SizeType32 outSize
-
SizeType32 moduleId
-
SizeType32 layerId
-
SizeType32 adapterSize
-
SizeType32 numSlots
-
std::int64_t weightsInPointer
-
std::int64_t weightsOutPointer
Friends
- friend class TaskLayerModuleConfigBindings
-
std::string toString() const
-
struct TaskValue
Holds configuration and state for a single task.
Public Functions
-
TaskValue() = delete
-
~TaskValue() = default
-
inline TaskValue(std::vector<std::size_t> const &pageIds, TaskLayerModuleConfigListPtr const &configs, std::list<TaskIdType>::iterator it, bool inProgress, bool loaded, bool done, bool loadInProgress = false)
Public Members
-
std::vector<std::size_t> pageIds
-
TaskLayerModuleConfigListPtr configs
-
std::list<TaskIdType>::iterator it
-
bool inProgress
-
bool loaded
-
bool done
Marks a task as done. This is used to mark a task as done during loading. If done=true at the end of loading (end of put, loadWeights, or copyTask), the task will be marked as done.
-
bool loadInProgress
Indicates weights are loading, either in put or loadWeights. This is used to block concurrent loadWeights calls for the same task.
-
TaskValue() = delete
-
using TaskIdType = std::uint64_t
-
class LoraCacheFullException : public tensorrt_llm::runtime::LoraExpectedException
-
class LoraCachePageManager
- #include <loraCache.h>
Holds memory of LoRA cache pages, and manages allocation and freeing of whole pages. Memory is pre-allocated either on the host or the device.
Note that this class is not thread safe.
Public Functions
-
LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)
- Parameters:
config – [in] a LoraCachePageManagerConfig
bufferManager – [in] a BufferManager used to allocate page blocks
-
std::optional<std::vector<std::size_t>> claimPages(SizeType32 numPages)
claim pages
- Parameters:
numPages – [in] number of pages to claim
- Returns:
an optional that contains the list of claimed pageIds if the requested number of pages could be claimed, and std::nullopt otherwise
-
SizeType32 numAvailablePages() const
get number of available (free) pages in manager
- Returns:
number of free pages in manager
-
void releasePages(std::vector<std::size_t> const &pages)
release given pages
- Parameters:
pages – [in] list of pages to release (free)
-
ITensor::SharedConstPtr blockPtr(SizeType32 blockIdx) const
return pointer to given page block
- Parameters:
blockIdx – [in] the block index
- Returns:
— pointer to page block
-
ITensor::SharedConstPtr pagePtr(std::size_t pageIdx) const
return pointer to given page
- Parameters:
pageIdx – [in]
- Returns:
— const pointer to page
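A sketch of claiming and releasing pages; as described above, claimPages returns std::nullopt when the requested number of free pages is not available.

```cpp
#include <loraCache.h>  // assumed include path

using namespace tensorrt_llm::runtime;

void usePageManager(LoraCachePageManager& pageManager)
{
    // Try to claim three pages for a task.
    if (auto pageIds = pageManager.claimPages(3))
    {
        // Access the first claimed page, e.g. to copy weights into it.
        ITensor::SharedConstPtr page = pageManager.pagePtr(pageIds->front());
        (void) page;

        // Return the pages to the pool once the task is evicted.
        pageManager.releasePages(*pageIds);
    }
}
```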
Private Functions
-
void initialize(BufferManager const &bufferManager)
-
LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)
-
class LoraExpectedException : public std::runtime_error
Subclassed by tensorrt_llm::runtime::LoraCacheFullException
-
std::string to_string(LoraCache::TaskLayerModuleConfig const &v)
-
namespace runtime
loraCachePageManagerConfig.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
inline std::ostream &operator<<(std::ostream &os, LoraCachePageManagerConfig const &c)
-
inline std::string to_string(LoraCachePageManagerConfig const &c)
-
class LoraCachePageManagerConfig
- #include <loraCachePageManagerConfig.h>
Configuration for LoraCachePageManager
See LoraCache docs for description of pages, slots, and page blocks.
Public Functions
-
inline explicit constexpr LoraCachePageManagerConfig(runtime::MemoryType memType, nvinfer1::DataType dType, SizeType32 totalNumPages, SizeType32 maxPagesPerBlock, SizeType32 slotsPerPage, SizeType32 pageWidth, SizeType32 numCopyStreams)
-
inline runtime::MemoryType constexpr getMemoryType() const noexcept
-
inline void constexpr setMemoryType(runtime::MemoryType const &memoryType) noexcept
-
inline SizeType32 constexpr getTotalNumPages() const noexcept
-
inline void constexpr setTotalNumPage(SizeType32 const &totalNumPages) noexcept
-
inline SizeType32 constexpr getMaxPagesPerBlock() const noexcept
-
inline void constexpr setMaxPagesPerBlock(SizeType32 const &maxPagesPerBlock) noexcept
-
inline SizeType32 constexpr getSlotsPerPage() const noexcept
-
inline void constexpr setSlotsPerPage(SizeType32 const &slotsPerPage) noexcept
-
inline SizeType32 constexpr getPageWidth() const noexcept
-
inline void constexpr setPageWidth(SizeType32 const &pageWidth) noexcept
-
inline bool constexpr getInitToZero() const noexcept
-
inline void constexpr setInitToZero(bool initToZero) noexcept
-
inline SizeType32 constexpr getNumCopyStreams() const noexcept
-
inline void constexpr setNumCopyStreams(SizeType32 numCopyStreams) noexcept
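A construction sketch using the constructor above; all numeric values are made-up placeholders, real sizes depend on the model and the configured optimal adapter size.

```cpp
#include <loraCachePageManagerConfig.h>  // assumed include path

using namespace tensorrt_llm::runtime;

LoraCachePageManagerConfig makeHostPageConfig()
{
    return LoraCachePageManagerConfig(
        MemoryType::kCPU,            // where the page blocks live
        nvinfer1::DataType::kHALF,   // dtype of the stored weights
        /*totalNumPages=*/1024,
        /*maxPagesPerBlock=*/64,
        /*slotsPerPage=*/16,
        /*pageWidth=*/4096,
        /*numCopyStreams=*/1);
}
```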
Private Members
-
runtime::MemoryType mMemoryType
-
SizeType32 mTotalNumPages
-
SizeType32 mMaxPagesPerBlock
-
SizeType32 mSlotsPerPage
-
SizeType32 mPageWidth
-
SizeType32 mNumCopyStreams = 1
-
bool mInitToZero
-
inline explicit constexpr LoraCachePageManagerConfig(runtime::MemoryType memType, nvinfer1::DataType dType, SizeType32 totalNumPages, SizeType32 maxPagesPerBlock, SizeType32 slotsPerPage, SizeType32 pageWidth, SizeType32 numCopyStreams)
-
inline std::ostream &operator<<(std::ostream &os, LoraCachePageManagerConfig const &c)
-
namespace runtime
loraModule.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
inline std::ostream &operator<<(std::ostream &output, LoraModule const &module)
-
class LoraModule
Public Types
-
enum class ModuleType : SizeType32
Values:
-
enumerator kINVALID
-
enumerator kATTN_QKV
-
enumerator kATTN_Q
-
enumerator kATTN_K
-
enumerator kATTN_V
-
enumerator kATTN_DENSE
-
enumerator kMLP_H_TO_4H
-
enumerator kMLP_4H_TO_H
-
enumerator kMLP_GATE
-
enumerator kCROSS_ATTN_QKV
-
enumerator kCROSS_ATTN_Q
-
enumerator kCROSS_ATTN_K
-
enumerator kCROSS_ATTN_V
-
enumerator kCROSS_ATTN_DENSE
-
enumerator kMOE_H_TO_4H
-
enumerator kMOE_4H_TO_H
-
enumerator kMOE_GATE
-
enumerator kMOE_ROUTER
-
enumerator kMLP_ROUTER
-
enumerator kINVALID
Public Functions
-
inline explicit constexpr LoraModule(ModuleType const &t, SizeType32 inDim, SizeType32 outDim, bool inDimFirst, bool outDimFirst, SizeType32 inTpSplitDim, SizeType32 outTpSplitDim) noexcept
-
inline explicit constexpr LoraModule() noexcept
-
explicit constexpr LoraModule(LoraModule const &o) = default
-
constexpr LoraModule &operator=(LoraModule const &o) = default
-
inline SizeType32 constexpr flattenedInOutSize(SizeType32 adapterSize) const noexcept
-
inline SizeType32 constexpr inSize(SizeType32 adapterSize) const noexcept
-
inline SizeType32 constexpr outSize(SizeType32 adapterSize) const noexcept
-
inline SizeType32 constexpr localInSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline SizeType32 constexpr localOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline SizeType32 constexpr localInDim(SizeType32 tpSize) const noexcept
-
inline SizeType32 constexpr localOutDim(SizeType32 tpSize) const noexcept
-
inline SizeType32 constexpr localInAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline SizeType32 constexpr localOutAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline SizeType32 constexpr localInOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline SizeType32 constexpr value() const noexcept
-
inline std::string_view constexpr name() const noexcept
-
inline SizeType32 constexpr inDim() const noexcept
-
inline SizeType32 constexpr outDim() const noexcept
-
inline bool constexpr inDimFirst() const noexcept
-
inline bool constexpr outDimFirst() const noexcept
-
inline SizeType32 constexpr inTpSplitDim() const noexcept
-
inline SizeType32 constexpr outTpSplitDim() const noexcept
Public Static Functions
-
static std::vector<LoraModule> createLoraModules(std::vector<std::string> const &loraModuleNames, SizeType32 hiddenSize, SizeType32 mlpHiddenSize, SizeType32 numAttentionHeads, SizeType32 numKvAttentionHeads, SizeType32 attentionHeadSize, SizeType32 tpSize, SizeType32 numExperts)
-
static inline ModuleType constexpr toModuleType(std::string_view const &name)
-
static inline std::string_view constexpr toModuleName(ModuleType t) noexcept
-
static inline std::string_view constexpr toModuleName(SizeType32 id)
Private Members
-
ModuleType mType
-
SizeType32 mInDim
-
SizeType32 mOutDim
-
bool mInDimFirst
-
bool mOutDimFirst
-
SizeType32 mInTpSplitDim
-
SizeType32 mOutTpSplitDim
-
enum class ModuleType : SizeType32
-
inline std::ostream &operator<<(std::ostream &output, LoraModule const &module)
-
namespace runtime
medusaModule.h
-
namespace tensorrt_llm
-
namespace runtime
-
class MedusaModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
Public Types
-
using MedusaChoices = std::vector<std::vector<SizeType32>>
Public Functions
-
inline explicit MedusaModule(SizeType32 maxAcceptedTokens, SizeType32 maxDraftTokens) noexcept
-
inline explicit MedusaModule() noexcept
-
inline MedusaChoices const &getMedusaChoices() const noexcept
Private Members
-
MedusaChoices mDefaultMedusaChoices = {{0}, {0, 0}, {1}, {0, 1}, {2}, {0, 0, 0}, {1, 0}, {0, 2}, {3}, {0, 3}, {4}, {0, 4}, {2, 0}, {0, 5}, {0, 0, 1}, {5}, {0, 6}, {6}, {0, 7}, {0, 1, 0}, {1, 1}, {7}, {0, 8}, {0, 0, 2}, {3, 0}, {0, 9}, {8}, {9}, {1, 0, 0}, {0, 2, 0}, {1, 2}, {0, 0, 3}, {4, 0}, {2, 1}, {0, 0, 4}, {0, 0, 5}, {0, 0, 0, 0}, {0, 1, 1}, {0, 0, 6}, {0, 3, 0}, {5, 0}, {1, 3}, {0, 0, 7}, {0, 0, 8}, {0, 0, 9}, {6, 0}, {0, 4, 0}, {1, 4}, {7, 0}, {0, 1, 2}, {2, 0, 0}, {3, 1}, {2, 2}, {8, 0}, {0, 5, 0}, {1, 5}, {1, 0, 1}, {0, 2, 1}, {9, 0}, {0, 6, 0}, {0, 0, 0, 1}, {1, 6}, {0, 7, 0}}
-
using MedusaChoices = std::vector<std::vector<SizeType32>>
-
class MedusaModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
-
namespace runtime
memoryCounters.h
-
namespace tensorrt_llm
-
namespace runtime
-
class MemoryCounters
-
Public Functions
-
MemoryCounters() = default
-
inline SizeType32 getGpu() const
-
inline SizeType32 getCpu() const
-
inline SizeType32 getPinned() const
-
inline SizeType32 getUVM() const
-
inline SizeType32 getPinnedPool() const
-
template<MemoryType T>
inline void allocate(SizeType32 size)
-
void allocate(MemoryType memoryType, SizeType32 size)
-
template<MemoryType T>
inline void deallocate(SizeType32 size)
-
void deallocate(MemoryType memoryType, SizeType32 size)
-
std::string toString() const
Public Static Functions
-
static MemoryCounters &getInstance()
-
static std::string bytesToString(SizeType32 bytes, int precision = 2)
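A small usage sketch; the counters are maintained by the runtime's allocators, so the printed values simply reflect whatever has been allocated so far.

```cpp
#include <memoryCounters.h>  // assumed include path

#include <iostream>

using namespace tensorrt_llm::runtime;

void reportMemory()
{
    // Process-wide singleton updated by the runtime's allocators.
    auto const& counters = MemoryCounters::getInstance();

    std::cout << "GPU bytes in use: " << MemoryCounters::bytesToString(counters.getGpu()) << '\n';
    std::cout << counters.toString() << '\n';
}
```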
Private Members
-
std::atomic<SizeType32> mGpu = {}
-
std::atomic<SizeType32> mCpu = {}
-
std::atomic<SizeType32> mPinned = {}
-
std::atomic<SizeType32> mUVM = {}
-
std::atomic<SizeType32> mPinnedPool = {}
-
MemoryCounters() = default
-
class MemoryCounters
-
namespace runtime
modelConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class ModelConfig
Public Types
-
enum class ModelVariant : std::int32_t
Values:
-
enumerator kGpt
-
enumerator kChatGlm
-
enumerator kGlm
-
enumerator kMamba
-
enumerator kRecurrentGemma
-
enumerator kEncDec
-
enumerator kGpt
-
enum class LayerType : std::int32_t
Values:
-
enumerator kATTENTION
-
enumerator kRECURRENT
-
enumerator kLINEAR
-
enumerator kNOOP
-
enumerator kATTENTION
Public Functions
-
inline explicit ModelConfig(SizeType32 vocabSize, SizeType32 nbLayers, SizeType32 nbAttentionLayers, SizeType32 nbRnnLayers, SizeType32 nbHeads, SizeType32 hiddenSize, nvinfer1::DataType dtype)
-
inline SizeType32 constexpr getVocabSize() const noexcept
-
inline SizeType32 constexpr getVocabSizePadded(SizeType32 worldSize) const noexcept
-
inline SizeType32 countLocalLayers(LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
-
inline SizeType32 countLowerRankLayers(LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
-
inline SizeType32 getNbLayers(SizeType32 pipelineParallelism = 1) const
-
inline SizeType32 getNbAttentionLayers(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
-
inline SizeType32 getNbRnnLayers(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
-
inline SizeType32 constexpr getNbHeads() const noexcept
-
inline SizeType32 getNbKvHeads(SizeType32 layerIdx) const
-
inline void setNbKvHeads(SizeType32 nbKvHeads)
-
inline void setNbCrossKvHeads(SizeType32 nbKvHeads)
-
inline SizeType32 constexpr getHiddenSize() const noexcept
-
inline SizeType32 constexpr getEncoderHiddenSize() const noexcept
-
inline void constexpr setEncoderHiddenSize(SizeType32 encoderHiddenSize) noexcept
-
inline SizeType32 constexpr getSizePerHead() const noexcept
-
inline void constexpr setSizePerHead(SizeType32 sizePerHead) noexcept
-
inline bool constexpr useGptAttentionPlugin() const noexcept
-
inline void constexpr useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
-
inline bool constexpr useMambaConv1dPlugin() const noexcept
-
inline void constexpr useMambaConv1dPlugin(bool useMambaConv1dPlugin) noexcept
-
inline bool constexpr usePackedInput() const noexcept
-
inline void constexpr usePackedInput(bool inputPacked) noexcept
-
inline bool constexpr usePagedState() const noexcept
-
inline void constexpr usePagedState(bool pagedState) noexcept
-
inline SizeType32 constexpr getTokensPerBlock() const noexcept
-
inline void constexpr setTokensPerBlock(SizeType32 TokensPerBlock) noexcept
-
inline common::QuantMode constexpr getQuantMode() const noexcept
-
inline void constexpr setQuantMode(common::QuantMode QuantMode) noexcept
-
inline bool constexpr supportsInflightBatching() const noexcept
-
inline SizeType32 constexpr getMaxBatchSize() const noexcept
-
inline void constexpr setMaxBatchSize(SizeType32 maxBatchSize) noexcept
-
inline SizeType32 constexpr getMaxBeamWidth() const noexcept
-
inline void constexpr setMaxBeamWidth(SizeType32 maxBeamWidth) noexcept
-
inline SizeType32 constexpr getMaxInputLen() const noexcept
-
inline void constexpr setMaxInputLen(SizeType32 maxInputLen) noexcept
-
inline SizeType32 constexpr getMaxSequenceLen() const noexcept
-
inline void constexpr setMaxSequenceLen(SizeType32 maxSequenceLen) noexcept
-
inline std::optional<SizeType32> constexpr getMaxNumTokens() const noexcept
-
inline void constexpr setMaxNumTokens(std::optional<SizeType32> maxNumTokens) noexcept
-
inline SizeType32 constexpr getMaxEncoderLen() const noexcept
-
inline void constexpr setMaxEncoderLen(SizeType32 maxEncoderLen) noexcept
-
inline bool constexpr usePromptTuning() const noexcept
-
inline bool constexpr useMrope() const noexcept
-
inline void constexpr setUseMrope(bool useMrope) noexcept
-
inline SizeType32 constexpr getMaxPositionEmbeddings() const noexcept
-
inline void constexpr setMaxPositionEmbeddings(SizeType32 maxPositionEmbeddings) noexcept
-
inline SizeType32 constexpr getRotaryEmbeddingDim() const noexcept
-
inline void constexpr setRotaryEmbeddingDim(SizeType32 rotaryEmbeddingDim) noexcept
-
inline SizeType32 constexpr getMaxPromptEmbeddingTableSize() const noexcept
-
inline void constexpr setMaxPromptEmbeddingTableSize(SizeType32 maxPromptEmbeddingTableSize) noexcept
-
inline bool constexpr computeContextLogits() const noexcept
-
inline void constexpr computeContextLogits(bool computeContextLogits) noexcept
-
inline bool constexpr computeGenerationLogits() const noexcept
-
inline void constexpr computeGenerationLogits(bool computeGenerationLogits) noexcept
-
inline ModelVariant getModelVariant() const
-
inline void setModelVariant(ModelVariant modelVariant)
-
inline SizeType32 getMaxDecodingDraftTokens() const
-
inline SizeType32 constexpr getMaxDecodingTokens() const noexcept
-
inline void constexpr setContextFMHA(bool contextFMHA) noexcept
-
inline bool constexpr getContextFMHA() const noexcept
-
inline void constexpr setPagedContextFMHA(bool pagedContextFMHA) noexcept
-
inline bool constexpr getPagedContextFMHA() const noexcept
-
inline void constexpr setPpReduceScatter(bool ppReduceScatter) noexcept
-
inline bool constexpr getPpReduceScatter() const noexcept
-
inline bool constexpr useLoraPlugin() const noexcept
-
inline void constexpr useLoraPlugin(bool useLoraPlugin) noexcept
-
inline std::vector<LoraModule> const &getLoraModules() const noexcept
-
inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept
-
inline SizeType32 constexpr getMlpHiddenSize() const noexcept
-
inline void constexpr setMlpHiddenSize(SizeType32 mlpHiddenSize) noexcept
-
inline bool constexpr isKVCacheEnabled() const noexcept
-
inline bool constexpr isPagedKVCache() const noexcept
-
inline bool constexpr isContinuousKVCache() const noexcept
-
inline KVCacheType constexpr getKVCacheType() const noexcept
-
inline void constexpr setKVCacheType(KVCacheType kvCacheType) noexcept
-
inline bool constexpr useCrossAttention() const noexcept
-
inline void constexpr setUseCrossAttention(bool useCrossAttention) noexcept
-
inline bool constexpr usePositionEmbedding() const noexcept
-
inline void constexpr setUsePositionEmbedding(bool usePositionEmbedding) noexcept
-
inline bool constexpr useTokenTypeEmbedding() const noexcept
-
inline void constexpr setUseTokenTypeEmbedding(bool useTokenTypeEmbedding) noexcept
-
inline SizeType32 constexpr getMaxLoraRank() const noexcept
-
inline void constexpr setMaxLoraRank(SizeType32 maxLoraRank) noexcept
-
inline void setSpeculativeDecodingMode(SpeculativeDecodingMode mode) noexcept
-
inline bool hasSpeculativeDecodingModule() const noexcept
-
inline SpeculativeDecodingModule const &getSpeculativeDecodingModule() const noexcept
-
inline std::shared_ptr<SpeculativeDecodingModule const> getSpeculativeDecodingModulePtr() const noexcept
-
inline std::shared_ptr<SpeculativeDecodingModule> getSpeculativeDecodingModulePtr() noexcept
-
inline bool constexpr isTransformerBased() const noexcept
-
inline bool hasRnnConfig() const noexcept
-
inline bool constexpr isRnnBased() const noexcept
-
inline SpeculativeDecodingMode constexpr getSpeculativeDecodingMode() const noexcept
-
inline void setUseShapeInference(bool useShapeInference) noexcept
-
inline bool useShapeInference() const noexcept
-
inline ManageWeightsType getManageWeightsType() const noexcept
-
inline void setManageWeightsType(const ManageWeightsType manageWeightType) noexcept
-
inline std::string const &getModelName() const noexcept
-
inline void setModelName(std::string const &modelName)
-
inline std::vector<SizeType32> const &getNumKvHeadsPerLayer() const
-
inline std::pair<std::vector<SizeType32>::const_iterator, std::vector<SizeType32>::const_iterator> getNumKvHeadsPerLayerLocalRange(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
-
inline void setNumKvHeadsPerLayer(std::vector<SizeType32> const &headsPerLayer)
-
inline void setNumKvHeadsPerCrossLayer(std::vector<SizeType32> const &headsPerLayer)
-
inline SizeType32 getSumLocalKvHeads(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
-
inline bool constexpr skipCrossAttnBlocks() const noexcept
-
inline void constexpr setSkipCrossAttnBlocks(bool skipCrossAttnBlocks) noexcept
Public Static Functions
-
static inline KVCacheType KVCacheTypeFromString(std::string value)
-
static inline std::vector<SizeType32> getOptProfilesSplitPoints() noexcept
Public Static Attributes
-
static constexpr std::array kOPT_PROFILES_SPLIT_POINTS = {64, 128, 256, 512, 1024}
-
static constexpr SizeType32 kDEFAULT_NUM_TOKENS_PER_BLOCK = 64
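A construction sketch using the constructor and setters above; the numbers are placeholders for a hypothetical small decoder-only model, not values read from any real engine.

```cpp
#include <modelConfig.h>  // assumed include path

using namespace tensorrt_llm::runtime;

void configureModel()
{
    ModelConfig config(/*vocabSize=*/32000, /*nbLayers=*/32, /*nbAttentionLayers=*/32,
        /*nbRnnLayers=*/0, /*nbHeads=*/32, /*hiddenSize=*/4096, nvinfer1::DataType::kHALF);

    config.setMaxBatchSize(8);
    config.setMaxInputLen(2048);
    config.setMaxSequenceLen(4096);

    // Layer counts can be queried per pipeline-parallel partition.
    auto layersPerRank = config.getNbLayers(/*pipelineParallelism=*/2);
    (void) layersPerRank;
}
```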
Private Members
-
SizeType32 mVocabSize
-
SizeType32 mNbLayers
-
SizeType32 mNbAttentionLayers
-
SizeType32 mNbRnnLayers
-
SizeType32 mNbHeads
-
SizeType32 mHiddenSize
-
SizeType32 mSizePerHead
-
bool mUseGptAttentionPlugin
-
bool mUseMambaConv1dPlugin
-
bool mInputPacked
-
bool mPagedState
-
SizeType32 mTokensPerBlock
-
common::QuantMode mQuantMode
-
SizeType32 mMaxBatchSize
-
SizeType32 mMaxBeamWidth
-
SizeType32 mMaxInputLen
-
SizeType32 mMaxSequenceLen
-
std::optional<SizeType32> mMaxNumTokens
-
bool mComputeContextLogits
-
bool mComputeGenerationLogits
-
ModelVariant mModelVariant
-
SizeType32 mMaxPromptEmbeddingTableSize
-
bool mUseMrope
-
SizeType32 mMaxPositionEmbeddings
-
SizeType32 mRotaryEmbeddingDim
-
bool mContextFMHA
-
bool mPagedContextFMHA
-
bool mUseXQA
-
bool mPpReduceScatter
-
bool mUseLoraPlugin
-
std::vector<LoraModule> mLoraModules
-
SizeType32 mMlpHiddenSize
-
SizeType32 mMaxLoraRank
-
KVCacheType mKVCacheType = KVCacheType::kCONTINUOUS
-
SizeType32 mMaxEncoderLen = {}
-
SizeType32 mEncoderHiddenSize = {}
-
bool mUseCrossAttention
-
bool mUsePositionEmbedding
-
bool mUseTokenTypeEmbedding
-
std::shared_ptr<SpeculativeDecodingModule> mSpeculativeDecodingModule
-
SpeculativeDecodingMode mSpeculativeDecodingMode
-
bool mUseShapeInference
-
ManageWeightsType mManageWeightsType
-
std::string mModelName
-
std::vector<SizeType32> mNumKvHeadsPerAttentionLayer
-
std::vector<SizeType32> mNumKvHeadsPerCrossAttentionLayer
-
bool mSkipCrossAttnBlocks
-
struct RnnConfig
Public Members
-
SizeType32 stateSize = 0
-
SizeType32 convKernel = 0
-
SizeType32 rnnHiddenSize = 0
-
SizeType32 rnnHeadSize = 0
-
SizeType32 rnnConvDimSize = 0
-
SizeType32 stateSize = 0
-
enum class ModelVariant : std::int32_t
-
class ModelConfig
-
namespace runtime
promptTuningParams.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor>
class GenericPromptTuningParams -
Public Functions
-
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>
Public Types
-
using SizeType32 = GenericPromptTuningParams::SizeType32
Public Functions
-
inline explicit PromptTuningParams(TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr)
-
void fillTasksTensor(TensorPtr tasksHost, const SizeType32 batchSize, const SizeType32 numContextRequests, std::vector<SizeType32> const &reqBeamWidths, std::vector<SizeType32> const &reqPromptLengths, BufferManager const &manager, bool packedInput)
-
using SizeType32 = GenericPromptTuningParams::SizeType32
-
template<typename TTensor>
-
namespace runtime
rawEngine.h
-
namespace tensorrt_llm
-
namespace runtime
-
class RawEngine
Public Types
Public Functions
-
inline explicit RawEngine(std::filesystem::path enginePath) noexcept
-
inline explicit RawEngine(void const *engineAddr, std::size_t engineSize) noexcept
-
inline std::filesystem::path getPath() const
-
inline std::optional<std::filesystem::path> getPathOpt() const
-
inline void setPath(std::filesystem::path enginePath)
-
inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt() const
-
inline void setManagedWeightsMap(std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap)
-
inline void const *getAddress() const
-
inline std::size_t getSize() const
-
inline explicit RawEngine(std::filesystem::path enginePath) noexcept
-
class RawEngine
-
namespace runtime
request.h
-
namespace tensorrt_llm
-
namespace runtime
-
namespace decoder_batch
-
class Request
Public Types
-
using TensorConstPtr = ITensor::SharedConstPtr
Public Functions
-
inline explicit Request(TensorConstPtr ids, SizeType32 inputLen, std::optional<SizeType32> maxNewTokens = std::nullopt, std::optional<SizeType32> endId = std::nullopt)
Public Members
-
TensorConstPtr ids
-
SizeType32 inputLen
-
std::optional<SizeType32> maxNewTokens
-
std::optional<SizeType32> endId
-
SizeType32 generatedTokensPerEngineStep
-
std::optional<executor::LookaheadDecodingConfig> lookaheadRuntimeConfig
-
std::optional<executor::EagleConfig> eagleConfig
-
using TensorConstPtr = ITensor::SharedConstPtr
-
class Request
-
namespace decoder_batch
-
namespace runtime
runtimeDefaults.h
-
namespace tensorrt_llm
-
namespace runtime
-
struct RuntimeDefaults
Public Functions
-
inline RuntimeDefaults(std::optional<std::vector<SizeType32>> maxAttentionWindowVec, std::optional<SizeType32> sinkTokenLength)
-
RuntimeDefaults() = default
Public Members
-
std::optional<std::vector<SizeType32>> maxAttentionWindowVec
-
std::optional<SizeType32> sinkTokenLength
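A small sketch of the two ways to construct RuntimeDefaults, assuming the include path tensorrt_llm/runtime/runtimeDefaults.h; the window and sink-token values are illustrative.

    #include "tensorrt_llm/runtime/runtimeDefaults.h"

    #include <vector>

    using tensorrt_llm::runtime::RuntimeDefaults;
    using tensorrt_llm::runtime::SizeType32;

    // Leave both fields unset and let downstream components pick their own defaults.
    RuntimeDefaults const unset{};

    // Provide an attention-window vector (a single 4096 entry here) and a sink token length of 4.
    RuntimeDefaults const tuned{std::vector<SizeType32>{4096}, SizeType32{4}};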
samplingConfig.h
Defines
-
SET_FROM_OPTIONAL(varName, VarName, VarType)
-
namespace tensorrt_llm
-
namespace runtime
-
class SamplingConfig
Public Functions
-
inline explicit SamplingConfig(SizeType32 beamWidth = 1)
-
inline explicit SamplingConfig(std::vector<SamplingConfig> const &configs)
-
inline explicit SamplingConfig(executor::SamplingConfig const &samplingConfig, std::optional<executor::ExternalDraftTokensConfig> const &externalDraftTokensConfig)
-
inline bool validate()
-
inline bool operator==(SamplingConfig const &other) const
-
inline SizeType32 getNumReturnBeams() const
Public Members
-
SizeType32 beamWidth
-
std::optional<SizeType32> numReturnSequences
-
OptVec<SizeType32> minLength
-
OptVec<SizeType32> noRepeatNgramSize
-
OptVec<SizeType32> topK
-
OptVec<TokenIdType> topPResetIds
-
OptVec<SizeType32> earlyStopping
-
OptVec<std::vector<runtime::SizeType32>> topKMedusaHeads
-
std::optional<bool> normalizeLogProbs
Private Types
-
using FloatType = float
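A sketch of configuring per-request sampling options. The constructor, validate(), and the member names come from the listing above; OptVec<T> is assumed here to behave like an optional vector with one entry per request.

    #include "tensorrt_llm/runtime/samplingConfig.h"

    #include <stdexcept>
    #include <vector>

    using namespace tensorrt_llm::runtime;

    SamplingConfig makeGreedyConfig()
    {
        // Beam width 1: sampling / greedy decoding rather than beam search.
        SamplingConfig config{1};

        // Per-request options (assumed OptVec semantics): top-k of 1 and a minimum
        // generation length of 8 tokens for the single request in the batch.
        config.topK = std::vector<SizeType32>{1};
        config.minLength = std::vector<SizeType32>{8};

        // validate() reports whether the chosen combination of options is consistent.
        if (!config.validate())
        {
            throw std::invalid_argument("inconsistent sampling configuration");
        }
        return config;
    }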
speculativeDecodingMode.h
-
namespace tensorrt_llm
-
namespace runtime
-
class SpeculativeDecodingMode
Public Types
-
using UnderlyingType = std::uint8_t
Public Functions
-
inline bool constexpr isNone() const
-
inline bool constexpr isDraftTokensExternal() const
-
inline bool constexpr isMedusa() const
-
inline bool constexpr isLookaheadDecoding() const
-
inline bool constexpr isExplicitDraftTokens() const
-
inline bool constexpr isEagle() const
-
inline bool constexpr updatesPositionIds() const
-
inline bool constexpr requiresAttentionMask() const
-
inline bool constexpr predictsDraftTokens() const
-
inline bool constexpr needsKVCacheRewind() const
-
inline bool constexpr variableDraftLength() const
-
inline bool constexpr hasDraftLogits() const
-
inline bool constexpr needsDecoderPrologue() const
-
inline bool operator==(SpeculativeDecodingMode const &other) const
-
inline explicit constexpr SpeculativeDecodingMode(UnderlyingType state)
Public Static Functions
-
static inline auto constexpr None()
-
static inline auto constexpr DraftTokensExternal()
-
static inline auto constexpr Medusa()
-
static inline auto constexpr LookaheadDecoding()
-
static inline auto constexpr ExplicitDraftTokens()
-
static inline auto constexpr Eagle()
Private Functions
-
inline bool constexpr anyBitSet(UnderlyingType bits) const
-
inline bool constexpr allBitSet(UnderlyingType bits) const
Private Members
-
UnderlyingType mState = {kNone}
Private Static Attributes
-
static UnderlyingType constexpr kNone = {1U << 0U}
-
static UnderlyingType constexpr kDraftTokensExternal = {1U << 1U}
-
static UnderlyingType constexpr kMedusa = {1U << 2U}
-
static UnderlyingType constexpr kLookaheadDecoding = {1U << 3U}
-
static UnderlyingType constexpr kExplicitDraftTokens = {1U << 4U}
-
static UnderlyingType constexpr kEagle = {1U << 5U}
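The class is a small bit mask: each static factory above corresponds to one of the private kXxx bits, and the predicate functions describe what the runtime has to provide for that mode. A minimal sketch, assuming the include path tensorrt_llm/runtime/speculativeDecodingMode.h:

    #include "tensorrt_llm/runtime/speculativeDecodingMode.h"

    using tensorrt_llm::runtime::SpeculativeDecodingMode;

    inline void describeMode()
    {
        // Pick a mode via one of the constexpr factories.
        auto constexpr mode = SpeculativeDecodingMode::Medusa();

        static_assert(mode.isMedusa());
        static_assert(!mode.isNone());

        // Derived predicates, e.g. whether an attention mask or a KV-cache rewind
        // is needed for this mode.
        bool const needsMask = mode.requiresAttentionMask();
        bool const rewindsKvCache = mode.needsKVCacheRewind();

        (void) needsMask;
        (void) rewindsKvCache;
    }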
speculativeDecodingModule.h
-
namespace tensorrt_llm
-
namespace runtime
-
class SpeculativeDecodingModule
Subclassed by tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule
Public Functions
-
inline explicit SpeculativeDecodingModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens, SizeType32 maxNumPaths) noexcept
-
inline explicit SpeculativeDecodingModule() noexcept
-
virtual ~SpeculativeDecodingModule() = default
-
SpeculativeDecodingModule(SpeculativeDecodingModule const &o) = default
-
SpeculativeDecodingModule &operator=(SpeculativeDecodingModule const &o) = default
-
inline SizeType32 getMaxDraftPathLen() const noexcept
- Returns:
max number of draft tokens that can be accepted by one step of the decoder
-
inline SizeType32 getMaxPathLen() const noexcept
One more than the maximum draft path length, to account for the prediction from the primary head.
- Returns:
max number of tokens that a request can grow in one step of the decoder
-
inline SizeType32 getMaxDecodingDraftTokens() const noexcept
- Returns:
max number of draft tokens processed by one step of the decoder
-
inline SizeType32 getMaxDecodingTokens() const noexcept
One more than the maximum number of decoding draft tokens, to account for the prediction from the primary head.
- Returns:
max number of tokens processed by one step of the decoder
-
inline SizeType32 getNumPackedMasks() const noexcept
-
inline SizeType32 getMaxNumPaths() const noexcept
-
inline void setMaxDraftTokens(SizeType32 maxDraftTokens) noexcept
-
inline void setMaxDraftPathLen(SizeType32 maxDraftPathLen) noexcept
-
inline void setMaxNumPaths(SizeType32 maxNumPaths) noexcept
Private Functions
-
inline void computeNumPackedMasks() noexcept
Private Members
-
SizeType32 mMaxDraftPathLen
-
SizeType32 mMaxDecodingDraftTokens
-
SizeType32 mMaxNumPaths
-
SizeType32 mMaxNumPackedMasks
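The getters encode the relationships described above: the "path" and "decoding" limits are each one larger than their draft counterparts because of the token predicted by the primary head. A small sketch, assuming the include path tensorrt_llm/runtime/speculativeDecodingModule.h and illustrative limits:

    #include "tensorrt_llm/runtime/speculativeDecodingModule.h"

    #include <cassert>

    using tensorrt_llm::runtime::SpeculativeDecodingModule;

    inline void speculativeDecodingLimits()
    {
        // Up to 4 draft tokens per path, 63 draft tokens per decoder step, 8 paths.
        SpeculativeDecodingModule const specModule{/*maxDraftPathLen=*/4,
            /*maxDecodingDraftTokens=*/63, /*maxNumPaths=*/8};

        // Each limit grows by one for the token predicted by the primary head.
        assert(specModule.getMaxPathLen() == specModule.getMaxDraftPathLen() + 1);
        assert(specModule.getMaxDecodingTokens() == specModule.getMaxDecodingDraftTokens() + 1);
    }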
tllmLogger.h
worldConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class WorldConfig
Public Functions
-
explicit WorldConfig(SizeType32 tensorParallelism = 1, SizeType32 pipelineParallelism = 1, SizeType32 contextParallelism = 1, SizeType32 rank = 0, SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)
-
inline SizeType32 constexpr getSize() const noexcept
-
inline SizeType32 constexpr getTensorParallelism() const noexcept
-
inline bool constexpr isTensorParallel() const noexcept
-
inline SizeType32 constexpr getPipelineParallelism() const noexcept
-
inline bool constexpr isPipelineParallel() const noexcept
-
inline SizeType32 constexpr getContextParallelism() const noexcept
-
inline bool constexpr isContextParallel() const noexcept
-
inline SizeType32 constexpr getRank() const noexcept
-
inline SizeType32 constexpr getGpusPerNode() const noexcept
-
inline SizeType32 getGpusPerGroup() const noexcept
-
inline SizeType32 getDevice() const noexcept
-
inline SizeType32 getDeviceOf(SizeType32 rank) const noexcept
-
inline SizeType32 constexpr getPipelineParallelRank() const noexcept
-
inline SizeType32 constexpr getTensorParallelRank() const noexcept
-
inline SizeType32 constexpr getContextParallelRank() const noexcept
-
inline SizeType32 constexpr getLocalRank() const noexcept
-
inline SizeType32 constexpr getNodeRank() const noexcept
-
inline SizeType32 constexpr getNodeRankOf(SizeType32 rank) const noexcept
-
inline bool constexpr isFirstPipelineParallelRank() const noexcept
-
inline bool constexpr isLastPipelineParallelRank() const noexcept
Whether this rank is the last rank in its pipeline.
-
inline bool constexpr isFirstTensorParallelRank() const noexcept
-
inline bool constexpr isFirstContextParallelRank() const noexcept
-
inline SizeType32 constexpr getLastRank() const noexcept
-
std::vector<SizeType32> getPipelineParallelGroup() const
-
std::vector<SizeType32> getTensorParallelGroup() const
-
std::vector<SizeType32> getContextParallelGroup() const
-
bool validMpiConfig() const
Public Static Functions
-
static WorldConfig mpi(SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType32> tensorParallelism = std::nullopt, std::optional<SizeType32> pipelineParallelism = std::nullopt, std::optional<SizeType32> contextParallelism = std::nullopt, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)
Public Static Attributes
-
static SizeType32 constexpr kDefaultGpusPerNode = 1
Private Members
-
SizeType32 mTensorParallelism
-
SizeType32 mPipelineParallelism
-
SizeType32 mContextParallelism
-
SizeType32 mRank
-
SizeType32 mGpusPerNode
-
std::vector<SizeType32> mDeviceIds
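A sketch of describing one rank in a multi-GPU world; the constructor and getters come from the listing above, while the parallelism sizes are illustrative. The getters decompose the flat rank into per-dimension coordinates, and getPipelineParallelGroup() returns the ranks that form this rank's pipeline-parallel group.

    #include "tensorrt_llm/runtime/worldConfig.h"

    using tensorrt_llm::runtime::SizeType32;
    using tensorrt_llm::runtime::WorldConfig;

    inline void describeWorld()
    {
        // Rank 5 of a 2-way tensor-parallel, 4-way pipeline-parallel world with
        // 8 GPUs per node (illustrative values).
        WorldConfig const world{/*tensorParallelism=*/2, /*pipelineParallelism=*/4,
            /*contextParallelism=*/1, /*rank=*/5, /*gpusPerNode=*/8};

        SizeType32 const tpRank = world.getTensorParallelRank();
        SizeType32 const ppRank = world.getPipelineParallelRank();
        bool const lastStage = world.isLastPipelineParallelRank();

        // Ranks in the same pipeline-parallel group as this one.
        auto const ppGroup = world.getPipelineParallelGroup();

        (void) tpRank;
        (void) ppRank;
        (void) lastStage;
        (void) ppGroup;
    }

When processes are launched under MPI, the static WorldConfig::mpi() factory shown above builds the configuration without an explicit rank argument, taking only the parallelism sizes, GPUs per node, and optional device ids.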