Runtime
bufferManager.h
-
namespace tensorrt_llm
-
namespace runtime
-
class BufferManager
- #include <bufferManager.h>
A helper class for managing memory on host and device.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
using CudaMemPoolPtr = std::shared_ptr<CudaMemPool>
Public Functions
-
explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)
Construct a BufferManager.
- Parameters:
cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
-
inline ~BufferManager()
Destructor.
-
IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
-
ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
-
IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size and memory type.
-
ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions and memory type.
-
inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty IBuffer of the given memory type. It may be resized later.
-
inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty ITensor of the given memory type. It may be reshaped later.
-
void copy(void const *src, IBuffer &dst, MemoryType srcType) const
Copy src to dst.
-
void copy(IBuffer const &src, void *dst, MemoryType dstType) const
Copy src to dst.
-
IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
CudaStream const &getStream() const
Get the underlying cuda stream.
-
std::size_t memoryPoolReserved() const
The current size of the memory reserved by the memory pool.
-
std::size_t memoryPoolUsed() const
The current size of the memory used by the memory pool.
-
std::size_t memoryPoolFree() const
The current size of the memory free in the memory pool.
-
void memoryPoolTrimTo(std::size_t size)
Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
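A minimal sketch (assumed usage, not taken from the source) of inspecting and trimming the pool once a BufferManager instance, here called manager, exists:

auto const reservedBytes = manager.memoryPoolReserved();
auto const usedBytes = manager.memoryPoolUsed();
auto const freeBytes = manager.memoryPoolFree();
manager.memoryPoolTrimTo(0); // try to return all unused pool memory; synchronizes with the stream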
Public Static Functions
-
static IBufferPtr gpuSync(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
-
static ITensorPtr gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
-
static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the CPU.
-
static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU.
-
static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
-
static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
-
static IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size in UVM.
-
static ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions in UVM.
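As a usage sketch (assumed, not from the source; ITensor::makeShape is assumed from the runtime tensor API and the shapes are illustrative), a BufferManager bound to a stream can allocate device memory asynchronously, copy host data to the GPU, and fall back to the static helpers for host-side allocations:

auto stream = std::make_shared<tensorrt_llm::runtime::CudaStream>();
tensorrt_llm::runtime::BufferManager manager{stream};

// Asynchronous GPU allocations on the manager's stream (cudaMallocAsync).
auto logits = manager.gpu(ITensor::makeShape({8, 1, 32000}), nvinfer1::DataType::kFLOAT);
auto scratch = manager.allocate(MemoryType::kGPU, 1024, nvinfer1::DataType::kUINT8);

// Copy host data into a new GPU buffer.
std::vector<std::int32_t> tokens{1, 2, 3, 4};
auto tokensDevice = manager.copyFrom(tokens, MemoryType::kGPU);

// Static helpers allocate without a manager, e.g. pinned host memory.
auto hostStaging = tensorrt_llm::runtime::BufferManager::pinned(1024);

manager.getStream().synchronize();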
Friends
- friend class ::BufferManagerTest
common.h
Defines
-
FMT_DIM
-
namespace tensorrt_llm
-
namespace runtime
Typedefs
-
using SizeType32 = std::int32_t
-
using TokenIdType = std::int32_t
-
using LoraTaskIdType = std::uint64_t
-
using TokenExtraIdType = std::uint64_t
-
using VecTokenExtraIds = std::vector<TokenExtraIdType>
-
using VecUniqueTokens = std::vector<UniqueToken>
-
struct UniqueToken
Public Functions
-
inline bool operator==(UniqueToken const &other) const noexcept
cudaEvent.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaEvent
Public Types
-
using pointer = cudaEvent_t
Public Functions
-
inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)
Creates a new cuda event. The event will be destroyed in the destructor.
- Parameters:
flags – Flags for event creation. By default, event timing is disabled.
-
inline explicit CudaEvent(pointer event, bool ownsEvent = true)
Pass an existing cuda event to this object.
- Parameters:
event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
-
inline void synchronize() const
Synchronizes the event.
Private Types
-
using EventPtr = std::unique_ptr<element_type, Deleter>
cudaStream.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaStream
Public Functions
-
inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)
Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
- Parameters:
flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
-
inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)
Pass an existing cuda stream to this object.
- Parameters:
stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
-
inline explicit CudaStream(cudaStream_t stream)
Construct with an existing cuda stream or the default stream by passing nullptr.
-
inline int getDevice() const
Returns the device on which the stream was created.
-
inline cudaStream_t get() const
Returns the stream associated with this object.
-
inline void synchronize() const
Synchronizes the stream.
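A brief sketch (assumed usage, not from the source) of creating a stream and wrapping an externally owned one:

// Create a new non-blocking stream with default priority on the current device.
tensorrt_llm::runtime::CudaStream stream{cudaStreamNonBlocking, 0};
int const device = stream.getDevice();

// Wrap an existing stream without taking ownership.
cudaStream_t raw{};
cudaStreamCreate(&raw);
tensorrt_llm::runtime::CudaStream wrapped{raw, device, /*ownsStream=*/false};

stream.synchronize();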
decodingInput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class DecodingInput
- #include <decodingInput.h>
Represents the inputs to the decoder.
This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.
Public Functions
-
inline DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize, TensorConstPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)
Public Members
-
SizeType32 step
The index of the decoding step we are on. Only used in Python runtime.
-
SizeType32 maxLength
The maximum number of tokens to decode.
-
SizeType32 maxAttentionWindow
The maximum length of the attention window to consider while decoding.
-
SizeType32 sinkTokenLength
The number of tokens to use as attention sinks, as described in https://arxiv.org/html/2309.17453v3
-
SizeType32 batchSize
The number of samples in the batch.
-
SizeType32 maxStopWordsLen
The maximum value in the stopWordsLens tensor.
-
SizeType32 maxBadWordsLen
The maximum value in the badWordsLens tensor.
-
TensorConstPtr logits
[batchSize, beamWidth, vocabSizePadded], on gpu. Logits are a probability distribution over the vocabulary, the output of the model.
-
TensorConstPtr endIds
[batchSize * beamWidth], on gpu
-
TensorConstPtr batchSlots
[batchSize], address map of the linear batch id to the seq slots, int32_t, pinned
-
TensorConstPtr finishReasons
[batchSize, beamWidth], finished states at the current iteration. If true for a request, its decoding step is skipped, on gpu
-
TensorConstPtr sequenceLimitLength
[batchSize], on gpu. The maximum sequence length for each sequence in the batch.
-
TensorConstPtr embeddingBias
[batchSize, vocabSizePadded], on gpu
-
TensorConstPtr lengths
[batchSize, beamWidth], on gpu
-
TensorConstPtr badWordsPtrs
[batchSize][2, badWordsLength], on gpu
-
TensorConstPtr badWordsLens
[batchSize], on gpu
-
TensorConstPtr stopWordsPtrs
[batchSize][2, stopWordsLength], pinned
-
TensorConstPtr stopWordsLens
[batchSize], pinned
-
TensorConstPtr noRepeatNgramSize
[batchSize], on gpu
-
TensorPtr cacheIndirection
[batchSize, beamWidth, maxSeqLen] - the k/v cache index for beam search, on gpu
-
std::optional<MedusaInputs> medusaInputs
-
std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs
-
std::optional<LookaheadInputs> lookaheadInputs
-
std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs
-
class ExplicitDraftTokensInputs
Public Members
-
TensorConstPtr nextDraftTokens
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr nextFlatTokens
[batchSize * maxDecodingTokens]
-
TensorConstPtr nextDraftIndices
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr nextDraftProbs
[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]
-
TensorConstPtr lastDraftTokens
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr lastDraftIndices
[batchSize, maxNumPaths, maxPathLen]
-
TensorConstPtr masks
[batchSize, maxDecodingTokens, maxDecodingTokens], bool
-
TensorConstPtr packedPositionIds
[batchSize * maxDecodingTokens]
-
TensorConstPtr bestPathLengths
[batchSize]
-
TensorConstPtr bestPathIndices
[batchSize]
-
TensorConstPtr nextGenerationLengths
[batchSize]
-
TensorConstPtr lastPositionIdsBase
[batchSize]
-
TensorConstPtr lastGenerationLengths
[batchSize]
-
TensorConstPtr maxGenLengthDevice
[1]
-
TensorConstPtr seqSlots
[batchSize]
-
TensorConstPtr nextDraftTokens
-
class ExternalDraftTokensInputs
-
struct LookaheadInputs
-
class MedusaInputs
Public Members
-
TensorConstPtr medusaPaths
[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu
-
TensorConstPtr medusaTreeIds
[batchSize, maxTokensPerStep], on gpu
-
std::vector<std::vector<TensorPtr>> medusaLogits
[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu
-
TensorConstPtr medusaTargetTokensPerStep
[batchSize], on gpu
decodingOutput.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace runtime
-
class DecodingOutput
-
Public Members
-
BeamHypotheses beamHypotheses
-
std::optional<SpeculativeDecodingOutputs> speculativeDecodingOutputs
-
std::optional<ExplicitDraftTokensBuffers::Inputs> explicitDraftTokensBuffers
-
std::optional<LookaheadDecodingBuffers> lookaheadOutputs
Public Static Attributes
-
static constexpr float kNegativeInfinity = -1e20f
-
class BeamHypotheses
Public Functions
-
void empty(BufferManager &manager)
-
void reshape(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)
-
void release()
-
void init(BufferManager &manager, TokenIdType endId)
-
BeamHypotheses slice(SizeType32 batchIndex, SizeType32 size) const
-
void empty(BufferManager &manager)
-
class SpeculativeDecodingOutputs
explicitDraftTokensBuffers.h
-
namespace tensorrt_llm
-
namespace runtime
-
class ExplicitDraftTokensBuffers
Public Types
-
using SizeType32 = runtime::SizeType32
-
using TensorMap = runtime::StringPtrMap<runtime::ITensor>
Public Functions
-
ExplicitDraftTokensBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
-
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const &modelConfig)
-
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const &requestTypes, ITensor const &seqSlots, ExplicitDraftTokensBuffers::Inputs const &decoderBuffers, ITensor const &contextPositionIds, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
-
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const
Public Members
-
tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs engineInputs
-
class tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs engineOutputs
-
std::size_t scanTempStorageBytes = {0}
Private Functions
-
template<typename T>
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded, ITensor const &seqSlots, ExplicitDraftTokensBuffers::Inputs const &draftBuffers, ITensor const &contextPositionIds, runtime::ExplicitDraftTokensModule const &explicitDraftTokensModule, runtime::CudaStream const &stream) const
-
class EngineInputs : public tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs
-
class EngineOutputs
Public Members
-
class Inputs
Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
Public Functions
-
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig)
Public Members
-
TensorPtr randomDataValidation
[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
-
TensorPtr draftTokens
[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-
TensorPtr draftIndices
[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-
TensorPtr draftProbs
[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
generationInput.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor, typename PromptTuningParams>
class GenericGenerationInput
- #include <generationInput.h>
endId, is the token ID that marks the end of the input sequence (aka EOS or end-of-sequence). It's 50,256 for the GPT2 model, which has a vocabulary of 50,257 tokens, for example,
padId, is the token ID that is used for padding (i.e. it fills in the slots that are at an index greater-or-equal to the input length for padded sequences). It can be set to the same value as endId,
ids, is the tensor of input IDs. That tensor must be allocated on the GPU. When the input tensor is padded, the shape of ids is [batchSize, maxInputLength], where batchSize and maxInputLength must respect the maximum sizes in sessionConfig passed to the GptSession constructor. When the input is packed, the shape of ids is [numTokens], where numTokens is the sum of the lengths of the different sequences in the batch,
lengths, is the tensor of input sequence lengths. That tensor must be allocated on the GPU and contain batchSize values,
packed, indicates if the ids tensor is packed or padded. In this release, that flag must match the value passed to the constructor through the instance of the ModelConfig class. In a future release, the session may be made more flexible and automatically pad or pack the input,
embeddingBiasOpt, is a tensor of floating-point values on the GPU that contains the bias to add to the logits during sampling (after the projection from hidden states to logits as the last step of the model). This tensor must have vocabSize elements (as defined in the modelConfig argument passed to the constructor),
badWordsList, is a tensor of integers on the GPU that encodes the list of words that have to be banned from generated sequences. Its shape is [2, badWordsLength], as explained below, or [batchSize, 2, badWordsLength] when there is a different list for each sequence in the batch,
stopWordsList, is a tensor of integers on the GPU that encodes the list of words that trigger the end of the generation for a sequence. Its shape is [2, stopWordsLength], as explained below, or [batchSize, 2, stopWordsLength] when there is a different list for each sequence in the batch,
maxNewTokens, is the maximum number of tokens to generate.
The badWordsList and stopWordsList tensors have the same shape [2, length]. Let's consider an example with three words to describe the representation of those lists. The first word contains tokens [5, 7, 3], the second one contains [9, 2], and the third one is composed of tokens [6, 2, 4, 1]. In total, there are 9 tokens. That's the length. The shape of the tensor is [2, 9]. The first row of the tensor must contain the 9 token IDs and the second row must store the inclusive prefix-sum of the word lengths, as shown on the following diagram:

   0           3       5              9
   |           |       |              |
   V           V       V              V
[  5,  7,  3,  9,  2,  6,  2,  4,  1]
[  3,  5,  9, -1, -1, -1, -1, -1, -1]

In case all the words are made of a single token, the inner-most dimension of the tensor must be increased by 1 (i.e. the length for 4 words, each made of a single token, must be 5 instead of 4; the shape is [2, 5]).
Public Functions
-
inline explicit GenericGenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)
Public Members
-
SizeType32 endId
-
SizeType32 padId
-
bool packed
-
std::optional<SizeType32> maxNewTokens
-
PromptTuningParams promptTuningParams
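Following the [2, length] word-list encoding described above, a minimal sketch (assumed usage, not from the source; manager is a hypothetical BufferManager and ITensor::makeShape is assumed from the runtime tensor API) that builds the example list as a stopWordsList tensor:

// First row: token IDs of the three words, concatenated.
// Second row: inclusive prefix-sum of the word lengths, padded with -1.
std::vector<std::int32_t> stopWords{5, 7, 3, 9, 2, 6, 2, 4, 1,
                                    3, 5, 9, -1, -1, -1, -1, -1, -1};
auto stopWordsList = manager.copyFrom(stopWords, ITensor::makeShape({2, 9}), MemoryType::kGPU);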
-
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
Public Types
-
using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
Public Functions
-
inline explicit GenerationInput(SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)
generationOutput.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor>
class GenericGenerationOutput
- #include <generationOutput.h>
ids, is a tensor that contains the output token IDs. Its shape is [batchSize, beamWidth, maxSeqLength] where maxSeqLength is the sum of maxInputLength and maxNewTokens. After generation, it contains, for each sequence, a copy of the input tokens followed by the output tokens. When a sequence is shorter than maxSeqLength, padding tokens are added at the end of the sequence.
Note that the shape of that tensor is different in this version of TensorRT-LLM from its shape in previous versions where it was .
logProbs, is a tensor of floating-point values on the GPU to store the log-prob of the generated tokens. Its shape is [maxNewTokens, batchSize, beamWidth]. Its shape will likely change in a future release to match the shape of the output ids tensor.
contextLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the context. Its shape is [batchSize, maxSequenceLength, vocabSizePadded]. If remove_input_padding is used, its shape is [packedSize, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_context_logits or gather_all_token_logits parameter enabled.
After inference is complete, you can get the context logits in GenerationOutput.contextLogits; these are variables on the GPU. For specific acquisition methods, please refer to the example of gptSessionBenchmark.cpp.
It is important to point out that enabling the computation may have an impact on performance (the language modeling head (LM head) has to perform a matrix multiplication on all the context tokens instead of just the last one).
generationLogits, is a tensor of values on the GPU (same datatype as the computation type) to store the logits for the generation. Its shape is [batchSize, beamWidth, maxOutputLen, vocabSizePadded]. This buffer will only be filled in if the TensorRT engine was built with the gather_generation_logits or gather_all_token_logits parameter enabled.
Generation logits can also be obtained through GenerationOutput.generationLogits after inference is completed.
onTokenGenerated, is a callback function invoked in the generation loop to pass newly generated tokens to the caller while the loop continues to execute. An implementation of that callback must accept the output ids tensor, the generation step, and a boolean flag that indicates if the generation is complete.
Public Types
-
using Callback = std::function<void(TensorPtr const &ids, SizeType32 step, bool finished)>
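For illustration only (assumed usage, not from the source; for GenerationOutput the TensorPtr argument is assumed to resolve to ITensor::SharedPtr), a callback matching this signature could stream tokens as they are produced:

GenerationOutput::Callback onTokenGenerated
    = [](ITensor::SharedPtr const& ids, SizeType32 step, bool finished)
{
    // ids holds the output token IDs produced so far; inspect or copy them here.
    if (finished)
    {
        // Last invocation of the callback for this generation loop.
    }
};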
-
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>
Public Types
-
using Base = GenericGenerationOutput<ITensor::SharedPtr>
gptDecoder.h
-
namespace tensorrt_llm
-
namespace layers
-
namespace runtime
Functions
-
inline runtime::ITensor::SharedConstPtr getDefaultBatchSlots(runtime::SizeType32 batchSize, runtime::BufferManager const &bufferManager)
Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
-
class IGptDecoder
Subclassed by tensorrt_llm::runtime::GptDecoder< T >
Public Types
-
using TensorConstPtr = runtime::ITensor::SharedConstPtr
Public Functions
-
virtual ~IGptDecoder() = default
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, TensorConstPtr const &batchSlots, std::optional<DecodingOutput> const &output = std::nullopt, std::optional<std::vector<decoder_batch::Request> const> const &requests = std::nullopt) = 0
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual void forwardSync(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual SamplingConfig const &getSamplingConfig() = 0
Public Static Functions
-
using TensorConstPtr = runtime::ITensor::SharedConstPtr
-
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder
Public Types
-
using CudaStreamPtr = BufferManager::CudaStreamPtr
Public Functions
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, TensorConstPtr const &batchSlots, std::optional<DecodingOutput> const &output = std::nullopt, std::optional<std::vector<decoder_batch::Request> const> const &requests = std::nullopt) override
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
-
virtual void forwardSync(DecodingOutput &output, DecodingInput const &input) override
-
inline virtual SamplingConfig const &getSamplingConfig() override
Private Members
-
std::shared_ptr<BufferManager> mManager
-
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
-
std::shared_ptr<tensorrt_llm::runtime::DecodingLayerWorkspace> mDecodingLayerWorkspace
-
SamplingConfig mSamplingConfig
-
size_t mMaxBatchSize
-
executor::DecodingMode mDecodingMode
gptDecoderBatched.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched
- #include <gptDecoderBatched.h>
GPT decoder class with support for in-flight batching.
Public Functions
-
GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const &speculativeDecodingMode, nvinfer1::DataType dtype)
-
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) override
Setup the decoder before calling forward().
-
virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override
Setup buffers for ExplicitDraftTokens decoding.
-
virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override
Setup buffers for Lookahead decoding.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) override
Initialize the decoder with new batch of inputs.
-
virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs) override
Initialize the batched decoder at seqSlots with new requests.
-
virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent) override
Wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent, decoder_batch::Output &output, decoder_batch::Input const &input) override
Call decoder forwardSync and wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() override
Wait for the last call to forwardAsync to complete.
-
inline virtual std::vector<bool> getFinished() const override
- Returns:
[batchSize], indicators of finished requests
-
inline virtual TensorPtr getFinishReasons() const override
- Returns:
[batchSize, beamWidth], FinishedState value, on gpu
-
inline virtual TensorPtr getIds(SizeType32 batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.
-
inline virtual TensorPtr getIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
-
inline virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.
-
inline virtual TensorPtr getGatheredIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu
-
virtual CudaEvent finalize(SizeType32 batchSlot, SamplingConfig const &samplingConfig, bool streaming) const override
Gather final beam search results for request batchSlot. The result will only be available after the returned event.
-
virtual void finalize(SamplingConfig const &samplingConfig) const override
Gather final beam search results for all requests.
-
inline virtual TensorPtr getParentIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-
inline virtual TensorPtr getCumLogProbs() const override
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs() const override
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs(SizeType32 batchIdx) const override
- Returns:
[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getAllNewTokens() const override
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
inline virtual TensorPtr getNewTokens(SizeType32 iter = 0) const override
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
-
inline virtual std::vector<SizeType32> getNbSteps() const override
- Returns:
[batchSize], the number of generation steps executed on each request
-
inline virtual TensorPtr getNbFinished() const override
- Returns:
[1], number of finished sequences, in pinned host memory
-
inline virtual TensorPtr getNextDraftTokens() const override
- Returns:
[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
-
inline virtual TensorPtr getPrevDraftTokensLengths() const override
- Returns:
[batchSize], predicted draft tokens lengths for previous step, on gpu
-
inline virtual TensorPtr getNextDraftTokensLengths() const override
- Returns:
[batchSize], predicted draft tokens lengths for next step, on gpu
-
inline virtual TensorPtr getAcceptedLengthsCumSum() const override
- Returns:
[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
-
inline virtual TensorPtr getAcceptedPackedPaths() const override
- Returns:
[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
-
inline virtual executor::DecodingMode getDecodingMode() const override
Private Types
-
using GptDecoderPtr = std::unique_ptr<IGptDecoder>
-
using DecodingInputPtr = std::unique_ptr<DecodingInput>
-
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>
Private Functions
-
CudaEvent postProcessRequest(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const
Gather final beam search results for request batchIdx.
-
void newRequest(SizeType32 batchSlot, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)
Initialize the decoder at batchSlot with a new request.
-
void allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype)
Allocate buffers for speculative decoding.
-
void setupSpeculativeDecoding(ModelConfig const &modelConfig)
Setup buffers for speculative decoding.
-
void setupLookahead(ModelConfig const &modelConfig)
Setup buffers for lookahead decoding.
-
void newRequestSpeculativeDecoding(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)
Sets up decoder internal tensors for a new speculative decoding request.
-
void newRequestDraftTokensExternal(SizeType32 batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig)
Sets up decoder internal tensors for a new request in Draft model Sps mode.
-
void newRequestMedusa(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up decoder internal tensors for a new Medusa request.
-
void newRequestLookahead(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up decoder internal tensors for a new Lookahead request.
-
void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const &request)
Sets up decoder internal tensors for a new Explicit draft tokens request.
-
void updateFinished(decoder_batch::DecoderFinishedEvent const &decoderFinishEvent)
Updates finished state on host for all active requests.
-
void setExplicitDraftTokensInputs(decoder_batch::Input const &input)
Sets inputs for explicit draft tokens.
-
void forwardDispatch(decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)
Calls decoders for tokens per engine step.
-
void forwardDecoder(SizeType32 step, decoder_batch::Output &output, decoder_batch::Input const &input, ForwardType forwardType)
Calls decoder for whole batch.
Private Members
-
std::size_t const mVocabSize
-
std::size_t const mVocabSizePadded
-
CudaStreamPtr mRuntimeStream
-
CudaStreamPtr mDecoderStream
-
BufferManager mBufferManager
-
DecoderFinishedEventPtr mDecoderFinishEvent
-
GptDecoderPtr mDecoder
-
DecodingInputPtr mJointDecodingInput
-
DecodingOutputPtr mJointDecodingOutput
-
std::vector<SizeType32> mNbSteps
-
std::vector<bool> mFinished
-
std::vector<SizeType32> mMaxNewTokens
-
std::vector<SizeType32> mBeamWidths
-
std::vector<SizeType32> mNumDecodingEngineTokens
-
SizeType32 mMaxSequenceLength = {}
-
SizeType32 mMaxAttentionWindow = {}
-
SizeType32 mSinkTokenLength = {}
-
SizeType32 mActualBatchSize = {}
-
SizeType32 mMaxDecodingDecoderTokens = {}
-
SizeType32 mMaxDecodingEngineTokens = {}
-
SpeculativeDecodingMode mSpeculativeDecodingMode
-
executor::DecodingMode mDecodingMode = {executor::DecodingMode::Auto()}
-
std::shared_ptr<DecodingOutput::BeamHypotheses> mOutputBeamHypotheses = {nullptr}
-
DecodingOutput::TensorPtr mCumLogProbsTmp
-
SizeType32 mNumSMs
gptJsonConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptJsonConfig
Public Functions
-
inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 gpusPerNode, ModelConfig modelConfig)
-
inline ModelConfig const &getModelConfig() const
-
inline ModelConfig &getModelConfigMutable()
-
inline std::string const &getName() const
-
inline std::string const &getVersion() const
-
inline std::string const &getPrecision() const
-
inline constexpr SizeType32 getTensorParallelism() const
-
inline constexpr SizeType32 getPipelineParallelism() const
-
inline constexpr SizeType32 getGpusPerNode() const
-
inline constexpr SizeType32 getWorldSize() const
-
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
-
inline std::string engineFilename(WorldConfig const &worldConfig) const
Public Static Functions
-
static GptJsonConfig parse(std::string const &json)
-
static GptJsonConfig parse(std::istream &json)
-
static GptJsonConfig parse(std::filesystem::path const &path)
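A short sketch (assumed usage, not from the source; the path is hypothetical) of loading an engine's config.json and querying it:

auto const jsonConfig = GptJsonConfig::parse(std::filesystem::path{"engine_dir/config.json"});
auto const& modelConfig = jsonConfig.getModelConfig();
auto const worldSize = jsonConfig.getWorldSize(); // typically tensorParallelism * pipelineParallelism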
Private Members
-
std::string const mName
-
std::string const mVersion
-
std::string const mPrecision
-
SizeType32 const mTensorParallelism
-
SizeType32 const mPipelineParallelism
-
SizeType32 const mGpusPerNode
-
ModelConfig mModelConfig
gptSession.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace kv_cache_manager
-
namespace kv_cache_manager
-
namespace runtime
-
class GptSession
-
Public Functions
-
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, RawEngine const &rawEngine, LoggerPtr logger = nullptr)
- Parameters:
sessionConfig – Configuration of the session,
modelConfig – Description of the model,
worldConfig – Description of the environment,
rawEngine – The compiled TensorRT engine,
logger – The optional logger.
-
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
-
inline GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
-
GptSession(Config const &sessionConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
-
BufferManager const &getBufferManager() const
-
BufferManager::CudaStreamPtr getRuntimeStreamPtr() const
-
inline ModelConfig const &getModelConfig() const
-
inline WorldConfig const &getWorldConfig() const
-
inline int getDevice() const noexcept
-
inline bool getNormalizeLogProbs() const noexcept
This function performs the generation loop.
Given input tensors to read from and output tensors to populate, that member function will run the generation loop until it reaches the maximum number of tokens that can be produced or until each sequence has reached completion (due to the production of "end-of-sequence" or a word in the list of "stop words"). The pseudo-code of that function looks like (member function names were changed to keep the presentation simple):
// Have all the sequences in the batch reached completion?
bool allFinished = false;
// Until all sequences are finished or the number of steps reaches the limit...
for (int step = 0; !allFinished && step < maxNewTokens; ++step)
{
    // Trigger the computation of the logits...
    computeLogits(...);
    // Run the sampling to produce a token (for each active sequence) from the logits.
    allFinished = generateTokensFromLogits(...);
    // Callback to stream the output tokens while the generation loop continues.
    onTokenGenerated(...);
}
-
void setLayerProfiler()
Set LayerProfiler to collect performance per layer.
-
std::string getLayerProfileInfo() const
Print profile information per layer.
Private Types
-
using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
-
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
-
using TokenGeneratedCallback = std::function<void(SizeType32 step, bool finished)>
Private Functions
-
inline bool useCudaGraphs()
-
void createContexts()
-
void createBuffers(SizeType32 numMicroBatches)
-
void createDecoders(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType32 numMicroBatches, executor::DecodingMode const &decodingMode)
-
void createKvCacheManager(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, KvCacheConfig const &config)
-
void createCustomAllReduceWorkspace(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSequenceLength)
-
void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType32> const &generationBatchesOffsets, KvCacheManager const *kvCacheManager)
-
SizeType32 executeGenerationStep(SizeType32 step, std::vector<GenerationInput> const µBatchesInputs, std::vector<GenerationOutput> µBatchesOutputs, std::vector<SizeType32> const µBatchOffsets, KvCacheManager *kvCacheManager, std::vector<bool> µBatchesFinished)
-
void decoderStepAsync(SizeType32 decoderStep, SizeType32 microBatchId)
Execute decoder on last PP rank, receive decoder output on other PP ranks.
-
bool shouldStopSync(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 microBatchId)
Synchronize with the decoder and return the shouldStop flag.
-
void finalize(SizeType32 microBatchId, SamplingConfig const &samplingConfig)
Collect final output ids and log probs on last PP rank and send them to first PP rank.
Receives are asynchronous on host, so synchronization is required before access.
-
void kvCacheAddSequences(SizeType32 beamWidth, SizeType32 microBatchId, SizeType32 firstBatchIdx)
-
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType32 microBatchId) const
Populate outputIds and return reference to newTokens tensor.
-
TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)
-
bool shouldUseKVCacheManager() const
Private Members
-
ModelConfig const mModelConfig
-
WorldConfig const mWorldConfig
-
int mDevice = {-1}
-
std::shared_ptr<NcclCommunicator> mPipelineComm
-
std::shared_ptr<CudaStream> mCommStream
-
std::shared_ptr<AllReduceBuffers> mAllReduceBuffers
-
SizeType32 mDecoderMaxSequenceLength = {}
-
std::vector<SizeType32> mDecoderMaxAttentionWindowVec = {}
-
SizeType32 mDecoderMaxAttentionWindow = {}
-
SizeType32 mDecoderSinkTokenLength = {}
-
std::shared_ptr<TllmRuntime> mRuntime
-
std::shared_ptr<KvCacheManager> mKvCacheManager
-
MicroBatchConfig mMicroBatchConfig
-
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
-
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
-
bool mCudaGraphMode = {false}
-
std::vector<CudaGraphExecutor> mCudaGraphInstances
-
bool mNormalizeLogProbs = true
Friends
- friend class batch_manager::TrtGptModelV1
-
class Config
- #include <gptSession.h>
Configuration for session execution and buffer sizes.
generate may be called with batch size and beam width smaller than the configured parameters.
maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.
Public Functions
-
inline Config(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, float gpuWeightsPercent = 1.0)
Public Members
-
SizeType32 maxBatchSize
-
SizeType32 maxBeamWidth
-
SizeType32 maxSequenceLength
-
float gpuWeightsPercent
-
bool decoderPerRequest = {false}
-
bool cudaGraphMode = {false}
-
KvCacheConfig kvCacheConfig = {}
-
std::optional<SizeType32> ctxMicroBatchSize = std::nullopt
-
std::optional<SizeType32> genMicroBatchSize = std::nullopt
-
std::optional<executor::DecodingMode> decodingMode = std::nullopt
-
bool normalizeLogProbs = true
-
inline Config(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, float gpuWeightsPercent = 1.0)
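As an illustration (assumed usage, not from the source), a session configuration can be built with the constructor above and tuned through the public members:

GptSession::Config sessionConfig{/*maxBatchSize=*/8, /*maxBeamWidth=*/1, /*maxSequenceLength=*/2048};
sessionConfig.decoderPerRequest = true;
sessionConfig.cudaGraphMode = false;
sessionConfig.normalizeLogProbs = true;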
-
class CudaGraphExecutor
Public Functions
-
CudaGraphExecutor() = default
-
inline ~CudaGraphExecutor()
-
inline bool hasInstance()
-
void clear()
-
void prepareNextGraph(TllmRuntime const &runtime, SizeType32 nextContextId)
-
void launch(CudaStream const &stream)
Private Functions
-
void create(cudaGraph_t const &graph)
-
bool update(cudaGraph_t const &graph)
-
void uploadToStream(CudaStream const &stream)
Private Members
-
cudaGraphExec_t mInstance
-
CudaGraphExecutor() = default
-
class GenerationProfiler
- #include <gptSession.h>
Optional profiler class to profile the generation phase of an inference request.
Public Static Attributes
-
static constexpr unsigned int flags = {cudaEventDefault}
-
static constexpr unsigned int flags = {cudaEventDefault}
-
class MicroBatchConfig
Public Functions
-
inline MicroBatchConfig()
-
explicit MicroBatchConfig(SizeType32 maxBatchSize, SizeType32 pipelineParallelism, std::optional<SizeType32> genMicroBatchSize, std::optional<SizeType32> ctxMicroBatchSize)
-
inline constexpr SizeType32 numCtxPerGen() const
-
inline constexpr SizeType32 getGenGraphId(SizeType32 flipFlopId, SizeType32 generationBatchId) const
flip-flop between 2 graph instances for each generation batch.
Public Members
-
SizeType32 numCtxBatches
-
SizeType32 numGenBatches
-
SizeType32 ctxBatchSize
-
SizeType32 genBatchSize
iBuffer.h
-
template<>
struct MemoryTypeString<MemoryType::kGPU> Public Static Attributes
-
static constexpr auto value = "GPU"
-
static constexpr auto value = "GPU"
-
template<>
struct MemoryTypeString<MemoryType::kCPU> Public Static Attributes
-
static constexpr auto value = "CPU"
-
static constexpr auto value = "CPU"
-
template<>
struct MemoryTypeString<MemoryType::kPINNED> Public Static Attributes
-
static constexpr auto value = "PINNED"
-
static constexpr auto value = "PINNED"
-
template<>
struct MemoryTypeString<MemoryType::kUVM> Public Static Attributes
-
static constexpr auto value = "UVM"
-
static constexpr auto value = "UVM"
-
template<>
struct MemoryTypeString<MemoryType::kPINNEDPOOL> Public Static Attributes
-
static constexpr auto value = "PINNEDPOOL"
-
static constexpr auto value = "PINNEDPOOL"
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32> Public Types
-
using type = std::int32_t
-
using type = std::int32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64> Public Types
-
using type = std::int64_t
-
using type = std::int64_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true> Public Types
-
using type = std::uint32_t
-
using type = std::uint32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true> Public Types
-
using type = std::uint64_t
-
using type = std::uint64_t
-
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned> Public Types
-
using type = bool
-
using type = bool
-
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned> Public Types
-
using type = std::uint8_t
-
using type = std::uint8_t
-
template<>
struct TRTDataType<std::int8_t>
-
template<>
struct TRTDataType<std::int32_t>
-
template<>
struct TRTDataType<std::uint32_t> Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
template<>
struct TRTDataType<std::int64_t>
-
template<>
struct TRTDataType<std::uint64_t> Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
template<>
struct TRTDataType<std::uint8_t>
-
template<>
struct TRTDataType<kernels::KVCacheIndex> Public Static Attributes
-
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
-
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
-
template<>
struct TRTDataType<kernels::FinishedState> Public Static Attributes
-
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value
-
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value
-
namespace tensorrt_llm
-
namespace runtime
Typedefs
Enums
Functions
-
template<typename T>
T const *bufferCast(IBuffer const &buffer)
Gets a typed pointer to the constant underlying data of the buffer.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
buffer – The buffer to get a pointer to.
- Returns:
A pointer to constant T.
-
template<typename T>
T *bufferCast(IBuffer &buffer)
Gets a typed pointer to the underlying data of the buffer.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
buffer – The buffer to get a pointer to.
- Returns:
A pointer to T.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
bufferPtr – A possibly null shared ptr.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
bufferPtr – A possibly null shared ptr.
- Returns:
A pointer to const T, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to const T, possibly nullptr.
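A minimal sketch (assumed usage, not from the source) combining a host allocation with bufferCast to fill the buffer on the CPU:

auto buffer = BufferManager::cpu(16, nvinfer1::DataType::kINT32); // host buffer of 16 int32 elements
auto* data = bufferCast<std::int32_t>(*buffer);
for (std::size_t i = 0; i < buffer->getSize(); ++i)
{
    data[i] = static_cast<std::int32_t>(i);
}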
-
template<MemoryType T>
struct MemoryTypeString
- template<>
struct MemoryTypeString<MemoryType::kGPU>
Public Static Attributes
-
static constexpr auto value = "GPU"
-
static constexpr auto value = "GPU"
- template<>
struct MemoryTypeString<MemoryType::kCPU>
Public Static Attributes
-
static constexpr auto value = "CPU"
-
static constexpr auto value = "CPU"
- template<>
struct MemoryTypeString<MemoryType::kPINNED>
Public Static Attributes
-
static constexpr auto value = "PINNED"
-
static constexpr auto value = "PINNED"
- template<>
struct MemoryTypeString<MemoryType::kUVM>
Public Static Attributes
-
static constexpr auto value = "UVM"
-
static constexpr auto value = "UVM"
- template<>
struct MemoryTypeString<MemoryType::kPINNEDPOOL>
Public Static Attributes
-
static constexpr auto value = "PINNEDPOOL"
-
static constexpr auto value = "PINNEDPOOL"
-
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits
- #include <iBuffer.h>
For converting a TensorRT data type to a C++ data type.
- template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>
Public Types
-
using type = float
-
using type = float
- template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>
Public Types
-
using type = half
Public Static Attributes
-
static constexpr char name[] = "half"
-
static constexpr auto size = sizeof(type)
-
using type = half
- template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>
Public Types
-
using type = std::int8_t
Public Static Attributes
-
static constexpr char name[] = "int8"
-
static constexpr auto size = sizeof(type)
-
using type = std::int8_t
- template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>
Public Types
-
using type = std::int32_t
Public Static Attributes
-
static constexpr char name[] = "int32"
-
static constexpr auto size = sizeof(type)
-
using type = std::int32_t
- template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>
Public Types
-
using type = std::int64_t
Public Static Attributes
-
static constexpr char name[] = "int64"
-
static constexpr auto size = sizeof(type)
-
using type = std::int64_t
- template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>
Public Types
-
using type = std::uint32_t
Public Static Attributes
-
static constexpr char name[] = "uint32"
-
static constexpr auto size = sizeof(type)
-
using type = std::uint32_t
- template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>
Public Types
-
using type = std::uint64_t
Public Static Attributes
-
static constexpr char name[] = "uint64"
-
static constexpr auto size = sizeof(type)
-
using type = std::uint64_t
- template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>
Public Types
-
using type = bool
Public Static Attributes
-
static constexpr char name[] = "bool"
-
static constexpr auto size = sizeof(type)
-
using type = bool
- template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>
Public Types
-
using type = std::uint8_t
Public Static Attributes
-
static constexpr char name[] = "uint8"
-
static constexpr auto size = sizeof(type)
-
using type = std::uint8_t
-
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>
-
class BufferDataType
- #include <iBuffer.h>
A wrapper around nvinfer1::DataType that provides support for pointer types.
-
template<typename T, bool = false>
struct TRTDataType
- #include <iBuffer.h>
For converting a C++ data type to a TensorRT data type.
-
template<>
struct TRTDataType<float>
-
template<>
struct TRTDataType<half>
- template<>
struct TRTDataType<std::int8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT8
-
static constexpr auto value = nvinfer1::DataType::kINT8
- template<>
struct TRTDataType<std::int32_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT32
-
static constexpr auto value = nvinfer1::DataType::kINT32
- template<>
struct TRTDataType<std::uint32_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
- template<>
struct TRTDataType<std::int64_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT64
-
static constexpr auto value = nvinfer1::DataType::kINT64
- template<>
struct TRTDataType<std::uint64_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
template<>
struct TRTDataType<bool>
- template<>
struct TRTDataType<std::uint8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kUINT8
-
static constexpr auto value = nvinfer1::DataType::kUINT8
- template<>
struct TRTDataType<kernels::KVCacheIndex>
Public Static Attributes
-
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
-
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value
- template<>
struct TRTDataType<kernels::FinishedState>
Public Static Attributes
-
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value
-
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value
-
template<>
struct TRTDataType<void*> Public Static Attributes
-
static constexpr auto value = BufferDataType::kTrtPointerType
-
static constexpr auto value = BufferDataType::kTrtPointerType
-
template<typename T>
struct TRTDataType<T*> Public Static Attributes
-
static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
Private Static Attributes
-
static constexpr auto kUnderlyingType = BufferDataType{TRTDataType<std::remove_const_t<T>, false>::value}
-
static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
-
class IBuffer
Subclassed by tensorrt_llm::runtime::ITensor
Public Types
Public Functions
-
virtual void *data() = 0
Returns a pointer to underlying array.
-
virtual void const *data() const = 0
Returns a pointer to underlying array.
-
inline virtual void *data(std::size_t index)
Returns a pointer to the underlying array at a given element index.
-
inline virtual void const *data(std::size_t index) const
Returns a pointer to the underlying array at a given element index.
-
virtual std::size_t getSize() const = 0
Returns the size (in number of elements) of the buffer.
-
inline virtual std::size_t getSizeInBytes() const
Returns the size (in bytes) of the buffer.
-
virtual std::size_t getCapacity() const = 0
Returns the capacity of the buffer.
-
virtual char const *getDataTypeName() const
-
virtual MemoryType getMemoryType() const = 0
Returns the memory type of the buffer.
-
virtual char const *getMemoryTypeName() const
-
virtual void resize(std::size_t newSize) = 0
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
virtual void release() = 0
Releases the buffer. It will be reset to nullptr.
-
virtual ~IBuffer() = default
Public Static Functions
Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
- Parameters:
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
- Returns:
A view on the buffer.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying tensor which can be independently resized.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
Returns a view on the underlying tensor with a different size.
- Parameters:
tensor – The tensor to view.
size – The size of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
-
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)
Wraps the given `data` in an `IBuffer`. The `IBuffer` will not own the underlying `data` and cannot be resized beyond `capacity`.
- Parameters:
data – The data to wrap.
type – The data type of the `data`.
size – The size of the buffer.
capacity – The capacity of the buffer.
- Returns:
An `IBuffer`.
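For illustration, a minimal sketch of wrapping caller-owned host memory with `IBuffer::wrap` and classifying its memory type; the header path and the concrete sizes are assumptions, not part of the reference above.
```cpp
#include "tensorrt_llm/runtime/iBuffer.h" // assumed header path

using namespace tensorrt_llm::runtime;

void wrapHostArray()
{
    float hostData[16] = {};
    // The IBuffer does not own hostData and cannot be resized beyond 16 elements.
    auto buffer = IBuffer::wrap(hostData, nvinfer1::DataType::kFLOAT, /*size=*/16, /*capacity=*/16);
    // memoryType() classifies the raw pointer; for stack memory this is a host (CPU) type.
    auto memType = IBuffer::memoryType(hostData);
}
```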
-
static MemoryType memoryType(void const *data)
Determine the memory type of a pointer.
-
virtual void *data() = 0
-
template<typename T>
class BufferRange : public tensorrt_llm::common::ArrayView<T>
Public Types
-
using Base = tensorrt_llm::common::ArrayView<T>
-
using Base = tensorrt_llm::common::ArrayView<T>
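A minimal sketch of element-wise access through `BufferRange`, assuming it can be constructed directly from an `IBuffer` (its constructors are not listed above) and that the buffer holds float data on the host:
```cpp
#include "tensorrt_llm/runtime/iBuffer.h" // assumed header path

using namespace tensorrt_llm::runtime;

// Sums a host-resident float buffer; BufferRange behaves like its ArrayView base.
float sumHostBuffer(IBuffer& hostBuffer)
{
    float sum = 0.0F;
    for (auto value : BufferRange<float>(hostBuffer))
    {
        sum += value;
    }
    return sum;
}
```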
-
template<typename T>
-
namespace runtime
iGptDecoderBatched.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IGptDecoderBatched : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
- #include <iGptDecoderBatched.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::GptDecoderBatched
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
using DecoderFinishedEventPtr = std::unique_ptr<decoder_batch::DecoderFinishedEvent const>
Public Functions
-
virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) = 0
Setup buffers for ExplicitDraftTokens decoding.
-
virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) = 0
Setup buffers for Lookahead decoding.
-
virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &token, decoder_batch::Output &output, decoder_batch::Input const &input) = 0
Call decoder forwardSync and wait for the call to `forwardAsync` associated with a token to complete.
-
virtual void forwardSync(decoder_batch::DecoderFinishedEvent const &token) = 0
Wait for the call to `forwardAsync` associated with a token to complete.
-
inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)
Run one step for all requests and wait for completion on the host.
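A hypothetical sketch of the asynchronous call pattern described above; constructing `decoder_batch::Input` and `Output` is out of scope here, so they are assumed to be prepared by the caller.
```cpp
#include <iGptDecoderBatched.h>

using namespace tensorrt_llm::runtime;

// Runs one decoding step asynchronously and waits for it to finish.
void stepOnce(IGptDecoderBatched& decoder, decoder_batch::Output& output, decoder_batch::Input const& input)
{
    auto finishedEvent = decoder.forwardAsync(output, input); // returns without blocking the host
    // ... overlap host-side work here ...
    decoder.forwardSync(*finishedEvent); // wait for the step associated with the event
}
```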
-
virtual TensorPtr getIds(SizeType32 batchIdx) const = 0
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request `batchIdx`, on gpu
-
virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const = 0
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search in GptDecoderBatched It contains gathered token ids without padding, on gpu
-
virtual CudaEvent finalize(SizeType32 batchIdx, SamplingConfig const &samplingConfig, bool streaming) const = 0
Gather final beam search results for request `batchIdx`. The result will only be available after the returned event has completed.
-
virtual std::vector<bool> getFinished() const = 0
- Returns:
[batchSize (actual)], marks finished requests (per batch)
-
virtual TensorPtr getFinishReasons() const = 0
- Returns:
[batchSize, beamWidth], FinishedState value, on gpu
-
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, beamWidth], cumulative log probabilities (per beam), on gpu
-
virtual TensorPtr getCumLogProbs(SizeType32 batchIdx) const = 0
- Returns:
[beamWidth], cumulative log probabilities (per beam) for request batchIdx, on gpu
-
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, beamWidth, maxSeqLen], log probabilities (per beam), on gpu
-
virtual TensorPtr getLogProbs(SizeType32 batchIdx) const = 0
- Returns:
[beamWidth, maxSeqLen], cumulative log probabilities (per beam) for request batchIdx, on gpu
-
virtual std::vector<SizeType32> getNbSteps() const = 0
-
virtual executor::DecodingMode getDecodingMode() const = 0
-
virtual void newRequests(std::vector<SizeType32> const &seqSlots, std::vector<decoder_batch::Request> const &requests, std::vector<SamplingConfig> const &samplingConfigs) = 0
Initialize the batched decoder at `seqSlots` with new `requests`.
-
virtual TensorPtr getNextDraftTokens() const = 0
- Returns:
[batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu
-
virtual TensorPtr getPrevDraftTokensLengths() const = 0
- Returns:
[batchSize], predicted draft tokens lengths for previous step, on gpu
-
virtual TensorPtr getNextDraftTokensLengths() const = 0
- Returns:
[batchSize], predicted draft tokens lengths for next step, on gpu
Protected Functions
-
IGptDecoderBatched() = default
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
namespace decoder_batch
-
-
class Input
-
Public Functions
Public Members
-
std::vector<bool> active
-
std::optional<ExplicitDraftTokensBuffers::EngineOutputs> explicitDraftTokensInputs
-
std::optional<ExplicitDraftTokensBuffers::EngineInputs> explicitDraftTokensLastInputs
-
std::vector<bool> active
-
class Input
-
class IGptDecoderBatched : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
-
namespace runtime
iStatefulGptDecoder.h
-
namespace tensorrt_llm
-
namespace batch_manager
-
namespace runtime
-
class IStatefulGptDecoder
- #include <iStatefulGptDecoder.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::IGptDecoderBatched
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
virtual void setup(executor::DecodingMode const &mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, SizeType32 maxTokensPerStep, nvinfer1::DataType dtype, ModelConfig const &modelConfig) = 0
Setup the decoder before calling `forward()`; also calls reshapeBuffers.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) = 0
Initialize the decoder with new batch of inputs.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() = 0
Wait for the last call to `forwardAsync` to complete.
-
inline virtual void forward(decoder::Output &output, decoder::Input const &input)
Run one step for all requests.
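A hypothetical sketch of the stateful decoder step implied by these methods; the `decoder::Input`/`Output` objects and the preceding setup/newBatch calls are assumed to have happened elsewhere.
```cpp
#include <iStatefulGptDecoder.h>

using namespace tensorrt_llm::runtime;

// One asynchronous decoding step followed by synchronization and token retrieval.
void decodeStep(IStatefulGptDecoder& decoder, decoder::Output& output, decoder::Input const& input)
{
    decoder.forwardAsync(output, input);     // launch the step without blocking the host
    // ... host-side work ...
    decoder.forwardSync();                   // wait for the last forwardAsync to complete
    auto newTokens = decoder.getNewTokens(); // [batchSize, beamWidth], on gpu
}
```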
-
virtual void finalize(SamplingConfig const &samplingConfig) const = 0
Gather final beam search results for all requests.
-
virtual TensorPtr getIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
-
virtual TensorPtr getGatheredIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength] token ids after gatherTree
-
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
virtual TensorPtr getNewTokens(SizeType32 iter = 0) const = 0
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in `iter` (per beam), on gpu
-
virtual TensorPtr getAllNewTokens() const = 0
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
virtual TensorPtr getNbFinished() const = 0
- Returns:
[1], number of finished sequences, in pinned host memory
-
virtual ~IStatefulGptDecoder() = default
Protected Functions
-
IStatefulGptDecoder() = default
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
class IStatefulGptDecoder
-
namespace batch_manager
iTensor.h
-
namespace nvinfer1
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)
Utility function to print a shape.
-
std::ostream &operator<<(std::ostream &output, ITensor const &tensor)
Utility function to print a tensor with its shape.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
tensorPtr – A possibly null shared ptr.
- Returns:
A pointer to T const, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
tensorPtr – A possibly null shared ptr.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to T, possibly nullptr.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
- Template Parameters:
T – The type of the underlying data.
- Parameters:
optionalBufferPtr – A possibly empty optional.
- Returns:
A pointer to const T, possibly nullptr.
-
class ITensor : public virtual tensorrt_llm::runtime::IBuffer
-
Public Functions
-
~ITensor() override = default
-
template<SizeType32 n>
inline DimType64 getDimension() const
Returns the tensor n-th dimension. If n is negative, the dimension is counted from the end, i.e. the (nbDims + n)-th dimension is returned. TODO: replace with constexpr parameter when moving to C++20.
-
virtual void reshape(Shape const &dims) = 0
Sets the tensor dimensions. The new size of the tensor will be `volume(dims)`.
-
inline virtual void resize(std::size_t newSize) override
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
inline void squeeze(SizeType32 dim)
Removes the given unit dimensions from this tensor.
-
inline void unsqueeze(SizeType32 dim)
Adds a unit dimension at the specified position.
-
inline bool shapeEquals(std::initializer_list<SizeType32> const &other) const
-
template<typename T>
inline bool shapeEquals(T const *dims, SizeType32 count) const
Public Static Functions
-
static inline std::int64_t volume(Shape const &dims)
Returns the volume of the dimensions. Returns -1 if `d.nbDims < 0`.
-
static inline std::size_t volumeNonNegative(Shape const &shape)
Returns the volume of the dimensions. Throws if `d.nbDims < 0`.
-
static Shape squeeze(Shape const &shape, SizeType32 dim)
Removes the given unit dimension from `shape`.
- Parameters:
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
- Returns:
A new shape without the unit dimension.
-
static Shape unsqueeze(Shape const &shape, SizeType32 dim)
Add a unit dimension to `shape` at the specified position.
- Parameters:
shape – The shape to unsqueeze.
dim – The dimension where the unit dimension should be added.
- Returns:
A new shape with the added unit dimension.
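A minimal sketch of how these static shape helpers compose; the dimension values are illustrative only.
```cpp
#include <iTensor.h>

using namespace tensorrt_llm::runtime;

void shapeHelpers()
{
    auto shape = ITensor::makeShape({1, 8, 1, 64});  // [1, 8, 1, 64]
    auto squeezed = ITensor::squeeze(shape, 0);      // [8, 1, 64]
    auto restored = ITensor::unsqueeze(squeezed, 0); // [1, 8, 1, 64]
    auto numElements = ITensor::volume(restored);    // 512
    auto text = ITensor::toString(restored);         // human-readable shape string
}
```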
Creates a sliced view on the underlying `tensor`. The view will have the same data type as `tensor`.
- Parameters:
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
- Returns:
A view on the `buffer`.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
- Parameters:
offsetDims – The offset in multiple dimensions.
tensor – The tensor to view.
offsetDims – The offset dimensions of the view.
size – The size of the view w.r.t. the last dimension in offsetDims.
offsetDims – specifies all dimensions.
- Throws:
Whenever – offset overflows or the last dimension offset+size overflows.
- Returns:
A view of shape [size, the rest dimensions] or [size] when `offsetDims` specifies all dimensions.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims, std::size_t size)
Returns the rest of the slices at the last dimension when `size` is omitted.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, Shape const &offsetDims)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
- Parameters:
offsetDims – specifies all dimensions.
- Returns:
Just the block at the point, with shape of [the rest dimensions] or [1] when `offsetDims` specifies all dimensions.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr at(TConstPtr &&tensor, Shape const &offsetDims)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline ITensor::UniqueConstPtr at(TConstPtr &&tensor, std::initializer_list<DimType64> const &offsetDims)
Returns a view on the underlying `buffer` (or tensor) with the given shape.
- Parameters:
tensor – The tensor to view.
shape – The shape of the view.
- Returns:
A view on the `tensor`.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
Returns a view on the underlying `tensor` which can be independently reshaped.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the `tensor`.
Returns a flattened view on the underlying `tensor` which can be independently reshaped.
- Parameters:
tensor – The tensor to flatten.
sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.
- Returns:
A flattened view on the `tensor`.
-
static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)
Wraps the given `data` in an `ITensor`. The `ITensor` will not own the underlying `data` and cannot be reshaped beyond `capacity`.
- Parameters:
data – The data to wrap.
type – The data type of the `data`.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
- Returns:
An `ITensor`.
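For illustration, a sketch that views caller-owned, pre-allocated memory as a 2x8 half-precision tensor; the pointer and data type are placeholders.
```cpp
#include <iTensor.h>

using namespace tensorrt_llm::runtime;

// Views pre-allocated memory as an ITensor without taking ownership.
ITensor::UniquePtr wrapAsTensor(void* preallocatedData)
{
    auto shape = ITensor::makeShape({2, 8});
    // The tensor cannot be reshaped beyond a capacity of 16 elements.
    return ITensor::wrap(preallocatedData, nvinfer1::DataType::kHALF, shape, /*capacity=*/16);
}
```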
-
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
-
static Shape makeShape(std::initializer_list<DimType64> const &dims)
A convenience function to create a tensor shape with the given dimensions.
-
static std::string toString(Shape const &dims)
A convenience function for converting a tensor shape to a `string`.
-
static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)
A convenience function to compare shapes.
-
template<typename T>
static inline bool shapeEquals(Shape const &lhs, T const *dims, SizeType32 count)
A convenience function to compare shapes.
Protected Functions
-
ITensor() = default
-
~ITensor() override = default
-
inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)
-
namespace runtime
ipcUtils.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IpcMemory
-
Public Functions
-
IpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig, bool openIpc = true)
-
~IpcMemory()
-
inline std::vector<void*> const &getCommPtrs() const
Public Static Attributes
-
static constexpr size_t FLAGS_SIZE = (tensorrt_llm::kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)
Private Functions
-
void allocateIpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig)
-
void destroyIpcMemory()
-
IpcMemory(std::size_t bufferSize, BufferManager const &manager, WorldConfig const &worldConfig, bool openIpc = true)
-
class AllReduceBuffers
-
Public Functions
-
AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, SizeType32 hiddenSize, BufferManager const &manager, WorldConfig const &worldConfig)
-
AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxSequenceLength, SizeType32 hiddenSize, BufferManager const &manager, WorldConfig const &worldConfig)
-
class IpcMemory
-
namespace runtime
lookaheadBuffers.h
-
namespace tensorrt_llm
-
namespace runtime
-
class LookaheadDecodingBuffers
Public Types
-
using SizeType32 = runtime::SizeType32
-
using ITensor = tensorrt_llm::runtime::ITensor
Public Functions
-
LookaheadDecodingBuffers(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::BufferManager const &bufferManager)
-
using SizeType32 = runtime::SizeType32
-
class LookaheadRuntimeBuffers
Public Types
-
using SizeType32 = tensorrt_llm::runtime::SizeType32
-
using ITensor = tensorrt_llm::runtime::ITensor
-
using TensorMap = runtime::StringPtrMap<runtime::ITensor>
Public Functions
-
LookaheadRuntimeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const &manager, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig, executor::DecodingConfig const &decodingConfig, runtime::TllmRuntime const &runtime)
-
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const &requestTypes, ITensor const &seqSlots, LookaheadDecodingBuffers const &decoderLookaheadBuffers, runtime::TllmRuntime const &runtime, runtime::ModelConfig const &modelConfig, runtime::WorldConfig const &worldConfig) const
-
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 tokensPerStep)
-
void insertInputTensors(TensorMap &inputBuffers, TensorMap &outputBuffers, runtime::WorldConfig const &worldConfig) const
Public Members
-
using SizeType32 = tensorrt_llm::runtime::SizeType32
-
class LookaheadDecodingBuffers
-
namespace runtime
lookaheadModule.h
-
namespace tensorrt_llm
-
namespace runtime
-
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
Public Functions
-
inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
-
inline explicit LookaheadModule() noexcept
-
inline void setExecutionConfig(executor::LookaheadDecodingConfig const &config)
-
inline executor::LookaheadDecodingConfig const getExecutionConfig() const
Private Members
-
executor::LookaheadDecodingConfig mExecutionConfig
-
inline explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
-
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
-
namespace runtime
loraCache.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
std::string to_string(LoraCache::TaskLayerModuleConfig const &v)
-
std::ostream &operator<<(std::ostream &os, LoraCache::TaskLayerModuleConfig const &v)
-
class LoraExpectedException : public std::runtime_error
Subclassed by tensorrt_llm::runtime::LoraCacheFullException
-
class LoraCacheFullException : public tensorrt_llm::runtime::LoraExpectedException
-
class LoraCachePageManager
- #include <loraCache.h>
Holds memory of lora cache pages, and manages allocation and freeing of whole pages. Memory is pre-allocated either on the host or device.
Note that this class is not thread safe.
Public Functions
-
LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)
- Parameters:
config – [in] a LoraCachePageManagerConfig
bufferManager – [in] a Buffermanager used to allocate page blocks
-
std::optional<std::vector<std::size_t>> claimPages(SizeType32 numPages)
claim pages
- Parameters:
numPages – [in] number of pages to claim
- Returns:
an optional list of pageIds; it contains a value only if the requested pages were successfully claimed
-
SizeType32 numAvailablePages() const
get number of available (free) pages in manager
- Returns:
number of free pages in manager
-
void releasePages(std::vector<std::size_t> const &pages)
release given pages
- Parameters:
pages – [in] list of pages to release (free)
-
ITensor::SharedConstPtr blockPtr(SizeType32 blockIdx) const
return pointer to given page block
- Parameters:
blockIdx – [in]
- Returns:
— pointer to page block
-
ITensor::SharedConstPtr pagePtr(std::size_t pageIdx) const
return pointer to given page
- Parameters:
pageIdx – [in]
- Returns:
— const pointer to page
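A hypothetical sketch of the claim/use/release cycle described above; the page manager is assumed to have been constructed with a valid config and `BufferManager`.
```cpp
#include <loraCache.h>

using namespace tensorrt_llm::runtime;

void claimAndReleasePages(LoraCachePageManager& pageManager)
{
    if (auto pageIds = pageManager.claimPages(/*numPages=*/4))
    {
        auto firstPage = pageManager.pagePtr(pageIds->front()); // const view of one page
        // ... copy LoRA weights into the claimed pages ...
        pageManager.releasePages(*pageIds);
    }
    // If claimPages returned std::nullopt, not enough free pages were available.
}
```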
Private Functions
-
void initialize(BufferManager const &bufferManager)
-
LoraCachePageManager(LoraCachePageManagerConfig const &config, BufferManager const &bufferManager)
-
class LoraCache
- #include <loraCache.h>
Caches LoRA weights with LRU eviction policy.
Tasks put in the cache are marked in progress and cannot be evicted until they are marked done.
A cache page holds an optimally sized LoRA. A page is of size [numSlots x pageWidth]. An optimally sized LoRA is one that has the configured optimalAdapterSize.
Conceptually, a slot corresponds to an r=1, 1-layer, 1-module set of in/out weights. The page width is set to the number of weights in the smallest module.
The number of slots per page is then ceilDiv(num weights in optimally sized LoRA, num weights in smallest module).
Cache pages are allocated on one or more blocks.
Public Types
-
using TaskIdType = std::uint64_t
-
using TaskLayerModuleConfigListPtr = std::shared_ptr<std::vector<TaskLayerModuleConfig>>
Public Functions
-
LoraCache(LoraCachePageManagerConfig const &pageManagerConfig, ModelConfig const &modelConfig, WorldConfig const &worldConfig, BufferManager const &bufferManager)
- Parameters:
pageManagerConfig – [in] a LoraCachePageManagerConfig
modelConfig – [in] a ModelConfig
worldConfig – [in] a WorldConfig
bufferManager – [in] a BufferManager only used to allocate page blocks
-
void put(TaskIdType taskId, TensorPtr weights, TensorPtr config, bool load = true)
Put a task in the cache, claim pages for it, and optionally load the task weights.
- Parameters:
taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
load – [in] if true load weights before returning, otherwise do not
-
void loadWeights(TaskIdType taskId, TensorPtr weights, TensorPtr config)
Load task weights. This method must be called after put. It is designed to be called asynchronously after put returns with load = false.
- Parameters:
taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
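A hypothetical sketch of the task lifecycle spelled out by `put`, `loadWeights`, and `markTaskDone`; it assumes `TensorPtr` aliases `ITensor::SharedPtr` and that the cache and tensors are prepared elsewhere.
```cpp
#include <loraCache.h>

using namespace tensorrt_llm::runtime;

void cacheTask(LoraCache& cache, LoraCache::TaskIdType taskId, ITensor::SharedPtr weights, ITensor::SharedPtr config)
{
    cache.put(taskId, weights, config, /*load=*/false); // claim pages now, load later
    cache.loadWeights(taskId, weights, config);         // may be called asynchronously after put
    if (cache.isLoaded(taskId))
    {
        auto taskConfigs = cache.get(taskId); // pointers into the cached pages
    }
    cache.markTaskDone(taskId); // the task may now be evicted
}
```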
-
inline bool isLoaded(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— true if task is loaded (weights are in place) and false otherwise
-
bool isDone(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— true if task is marked done and can be evicted
-
inline bool has(TaskIdType taskId) const
- Parameters:
taskId – [in] the task id
- Returns:
— true if task is in the cache (not necessarily loaded) and false otherwise
-
std::shared_ptr<std::vector<TaskLayerModuleConfig>> get(TaskIdType taskId)
- Parameters:
taskId – [in] the task id
- Returns:
— list of Value objects with pointers to task weights
-
void bump(TaskIdType taskId)
bump task and make it the most recently used
- Parameters:
taskId – [in] the task id
-
void markTaskDone(TaskIdType taskId)
mark task done meaning it can be evicted
- Parameters:
taskId – [in] the task id
-
void markAllDone()
mark all tasks in cache done
-
SizeType32 determineNumPages(TaskIdType taskId) const
- Parameters:
taskId – [in] the taskid
- Returns:
— number of pages needed to store the given task
-
SizeType32 determineNumPages(TensorPtr config) const
- Parameters:
config – [in] lora config tensor
- Returns:
— number of pages needed to store the task configured with config tensor
-
bool fits(TensorPtr config) const
- Parameters:
config – [in] a lora config tensor
- Returns:
— true if the task fits in the cache, false otherwise
-
void copyTask(TaskIdType taskId, LoraCache &deviceCache, bool markDone = false)
copy task to another cache. Caches must have the same page size.
- Parameters:
taskId – [in] the task id to copy
deviceCache – [in] the LoraCache to copy the task to
markDone – [in] mark the copied task done as it’s copied
-
SizeType32 getNumPages() const
- Returns:
— total number of pages allocated to cache (used or not)
-
ITensor::SharedConstPtr getPagePtr(size_t pageId) const
- Parameters:
pageId – [in] the page id
- Returns:
— const pointer to page
Public Static Functions
-
static std::vector<LoraCache::TaskLayerModuleConfig> copyToPages(TensorPtr weights, TensorPtr config, ModelConfig const &modelConfig, WorldConfig const &worldConfig, std::unordered_map<SizeType32, LoraModule> moduleIdToModel, BufferManager const &manager, std::vector<TensorPtr> const &pages, std::vector<std::size_t> const &pageIds)
Copy task weights to cache pages.
- Parameters:
weights – [in] task weights
config – [in] task config tensor
modelConfig – [in] a ModelConfig
worldConfig – [in] a WorldConfig
moduleIdToModel – [in] map from lora module id to LoraModule
manager – [in] a BufferManager the manager to use to perform the copies
pages – [out] list of page tensors to copy weights to
pageIds – [in] page ids for the pages
- Returns:
— list of cache Values objects
-
static void splitTransposeCpu(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)
Splits the second dimension of input into tpSize parts and writes the tpRank-th split to output.
- Parameters:
output – [out] output tensor
input – [in] input tensor
tpSize – [in] number of splits
tpRank – [in] the split to write to output
Private Types
Private Functions
-
void bumpTaskInProgress(TaskIdType taskId)
-
ValueStatus getStatus(TaskIdType taskId) const
-
std::vector<std::size_t> claimPagesWithEvict(SizeType32 numPages)
claim numPages, evicting tasks if needed
- Parameters:
numPages – [in] number of pages to claim
- Throws:
std::runtime_error – if all pages cannot be claimed
- Returns:
— list of page ids
-
std::map<size_t, std::pair<size_t, SizeType32>> copyTaskMapPages(TaskValue &targetTaskValue, TaskValue const &sourceTaskValue, std::vector<size_t> const &targetPageIds, LoraCache const &targetCache)
Internal helper method used inside copyTask. Not thread safe on its own.
Private Members
-
LoraCachePageManagerConfig mPageManagerConfig
-
ModelConfig mModelConfig
-
WorldConfig mWorldConfig
-
mutable std::mutex mPagesMutex
-
std::unique_ptr<LoraCachePageManager> mCachePageManager
-
mutable std::mutex mCacheMutex
-
std::unordered_map<TaskIdType, TaskValuePtr> mCacheMap
-
std::list<TaskIdType> mInProgressTasks
-
std::list<TaskIdType> mDoneTasks
-
std::vector<std::unique_ptr<BufferManager>> mDeviceBufferManagers
-
std::unique_ptr<BufferManager> mBufferManager
-
std::unordered_map<SizeType32, LoraModule> mModuleIdToModule
Private Static Functions
-
template<typename T>
static void splitTransposeCpuInner(ITensor &output, ITensor const &input, SizeType32 tpSize, SizeType32 tpRank)
-
struct TaskLayerModuleConfig
- #include <loraCache.h>
Contains information on a single layer / module. A list of these configs is associated with each task and can be used to populate runtime tensors.
Public Functions
-
std::string toString() const
-
bool operator==(LoraCache::TaskLayerModuleConfig const &o) const
Public Members
-
std::size_t pageId
-
SizeType32 slotIdx
-
SizeType32 inSize
-
SizeType32 outSize
-
SizeType32 moduleId
-
SizeType32 layerId
-
SizeType32 adapterSize
-
SizeType32 numSlots
-
std::int64_t weightsInPointer
-
std::int64_t weightsOutPointer
-
std::string toString() const
-
struct TaskValue
Holds configuration and state for a single task.
Public Functions
-
TaskValue() = delete
-
~TaskValue() = default
-
inline TaskValue(std::vector<std::size_t> const &pageIds, TaskLayerModuleConfigListPtr const &configs, std::list<TaskIdType>::iterator it, bool inProgress, bool loaded, bool done, bool loadInProgress = false)
Public Members
-
std::vector<std::size_t> pageIds
-
TaskLayerModuleConfigListPtr configs
-
std::list<TaskIdType>::iterator it
-
bool inProgress
-
bool loaded
-
bool done
Marks a task as done. This is used to mark a task as done during loading. If done=true at the end of loading (end of put, loadWeights, or copyTask), the task will be marked as done.
-
bool loadInProgress
Indicates weights are loading either in put or loadWeights. This is used to block concurrent loadWeights calls for the same task.
-
TaskValue() = delete
-
using TaskIdType = std::uint64_t
-
std::string to_string(LoraCache::TaskLayerModuleConfig const &v)
-
namespace runtime
loraCachePageManagerConfig.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
inline std::ostream &operator<<(std::ostream &os, LoraCachePageManagerConfig const &c)
-
inline std::string to_string(LoraCachePageManagerConfig const &c)
-
class LoraCachePageManagerConfig
- #include <loraCachePageManagerConfig.h>
Configuration for LoraCachePageManager
See LoraCache docs for description of pages, slots, and page blocks.
Public Functions
-
inline explicit constexpr LoraCachePageManagerConfig(runtime::MemoryType memType, nvinfer1::DataType dType, SizeType32 totalNumPages, SizeType32 maxPagesPerBlock, SizeType32 slotsPerPage, SizeType32 pageWidth, SizeType32 numCopyStreams)
-
inline constexpr runtime::MemoryType getMemoryType() const noexcept
-
inline constexpr void setMemoryType(runtime::MemoryType const &memoryType) noexcept
-
inline constexpr SizeType32 getTotalNumPages() const noexcept
-
inline constexpr void setTotalNumPage(SizeType32 const &totalNumPages) noexcept
-
inline constexpr SizeType32 getMaxPagesPerBlock() const noexcept
-
inline constexpr void setMaxPagesPerBlock(SizeType32 const &maxPagesPerBlock) noexcept
-
inline constexpr SizeType32 getSlotsPerPage() const noexcept
-
inline constexpr void setSlotsPerPage(SizeType32 const &slotsPerPage) noexcept
-
inline constexpr SizeType32 getPageWidth() const noexcept
-
inline constexpr void setPageWidth(SizeType32 const &pageWidth) noexcept
-
inline constexpr bool getInitToZero() const noexcept
-
inline constexpr void setInitToZero(bool initToZero) noexcept
-
inline constexpr SizeType32 getNumCopyStreams() const noexcept
-
inline constexpr void setNumCopyStreams(SizeType32 numCopyStreams) noexcept
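For illustration, a sketch constructing a device-resident configuration; all numeric values are placeholders, and `MemoryType::kGPU` is assumed to name the device enumerator.
```cpp
#include <loraCachePageManagerConfig.h>

using namespace tensorrt_llm::runtime;

// A hypothetical GPU-backed page manager configuration.
auto makeExampleConfig()
{
    return LoraCachePageManagerConfig(MemoryType::kGPU, nvinfer1::DataType::kHALF,
        /*totalNumPages=*/128, /*maxPagesPerBlock=*/24, /*slotsPerPage=*/16,
        /*pageWidth=*/1024, /*numCopyStreams=*/1);
}
```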
Private Members
-
runtime::MemoryType mMemoryType
-
SizeType32 mTotalNumPages
-
SizeType32 mMaxPagesPerBlock
-
SizeType32 mSlotsPerPage
-
SizeType32 mPageWidth
-
SizeType32 mNumCopyStreams = 1
-
bool mInitToZero
-
inline explicit constexpr LoraCachePageManagerConfig(runtime::MemoryType memType, nvinfer1::DataType dType, SizeType32 totalNumPages, SizeType32 maxPagesPerBlock, SizeType32 slotsPerPage, SizeType32 pageWidth, SizeType32 numCopyStreams)
-
inline std::ostream &operator<<(std::ostream &os, LoraCachePageManagerConfig const &c)
-
namespace runtime
loraModule.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
inline std::ostream &operator<<(std::ostream &output, LoraModule const &module)
-
class LoraModule
Public Types
-
enum class ModuleType : SizeType32
Values:
-
enumerator kINVALID
-
enumerator kATTN_QKV
-
enumerator kATTN_Q
-
enumerator kATTN_K
-
enumerator kATTN_V
-
enumerator kATTN_DENSE
-
enumerator kMLP_H_TO_4H
-
enumerator kMLP_4H_TO_H
-
enumerator kMLP_GATE
-
enumerator kCROSS_ATTN_QKV
-
enumerator kCROSS_ATTN_Q
-
enumerator kCROSS_ATTN_K
-
enumerator kCROSS_ATTN_V
-
enumerator kCROSS_ATTN_DENSE
-
enumerator kMOE_H_TO_4H
-
enumerator kMOE_4H_TO_H
-
enumerator kMOE_GATE
-
enumerator kMOE_ROUTER
-
enumerator kMLP_ROUTER
-
enumerator kINVALID
Public Functions
-
inline explicit constexpr LoraModule(ModuleType const &t, SizeType32 inDim, SizeType32 outDim, bool inDimFirst, bool outDimFirst, SizeType32 inTpSplitDim, SizeType32 outTpSplitDim) noexcept
-
inline explicit constexpr LoraModule() noexcept
-
explicit constexpr LoraModule(LoraModule const &o) = default
-
constexpr LoraModule &operator=(LoraModule const &o) = default
-
inline constexpr SizeType32 flattenedInOutSize(SizeType32 adapterSize) const noexcept
-
inline constexpr SizeType32 inSize(SizeType32 adapterSize) const noexcept
-
inline constexpr SizeType32 outSize(SizeType32 adapterSize) const noexcept
-
inline constexpr SizeType32 localInSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline constexpr SizeType32 localOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline constexpr SizeType32 localInDim(SizeType32 tpSize) const noexcept
-
inline constexpr SizeType32 localOutDim(SizeType32 tpSize) const noexcept
-
inline constexpr SizeType32 localInAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline constexpr SizeType32 localOutAdapterSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline constexpr SizeType32 localInOutSize(SizeType32 adapterSize, SizeType32 tpSize) const noexcept
-
inline constexpr SizeType32 value() const noexcept
-
inline constexpr std::string_view name() const noexcept
-
inline constexpr SizeType32 inDim() const noexcept
-
inline constexpr SizeType32 outDim() const noexcept
-
inline constexpr bool inDimFirst() const noexcept
-
inline constexpr bool outDimFirst() const noexcept
-
inline constexpr SizeType32 inTpSplitDim() const noexcept
-
inline constexpr SizeType32 outTpSplitDim() const noexcept
Public Static Functions
-
static std::vector<LoraModule> createLoraModules(std::vector<std::string> const &loraModuleNames, SizeType32 hiddenSize, SizeType32 mlpHiddenSize, SizeType32 numAttentionHeads, SizeType32 numKvAttentionHeads, SizeType32 attentionHeadSize, SizeType32 tpSize, SizeType32 numExperts)
-
static inline constexpr ModuleType toModuleType(std::string_view const &name)
-
static inline constexpr std::string_view toModuleName(ModuleType t) noexcept
-
static inline constexpr std::string_view toModuleName(SizeType32 id)
Private Members
-
ModuleType mType
-
SizeType32 mInDim
-
SizeType32 mOutDim
-
bool mInDimFirst
-
bool mOutDimFirst
-
SizeType32 mInTpSplitDim
-
SizeType32 mOutTpSplitDim
-
enum class ModuleType : SizeType32
-
inline std::ostream &operator<<(std::ostream &output, LoraModule const &module)
-
namespace runtime
medusaModule.h
-
namespace tensorrt_llm
-
namespace runtime
-
class MedusaModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
Public Types
-
using MedusaChoices = std::vector<std::vector<SizeType32>>
Public Functions
-
inline explicit MedusaModule(SizeType32 maxAcceptedTokens, SizeType32 maxDraftTokens) noexcept
-
inline explicit MedusaModule() noexcept
-
inline MedusaChoices const &getMedusaChoices() const noexcept
-
void initMedusaTensorsFromChoices(MedusaChoices const &choices, std::vector<SizeType32> &topKs, TensorPtr &generationInputLengths, TensorPtr &positionOffsets, TensorPtr &treeIds, TensorPtr &paths, TensorPtr &packedMask, SizeType32 &totalPaths) const noexcept
Private Types
-
using Prefix = uint64_t
Private Functions
-
SizeType32 computePathsAndMask(std::vector<MedusaTreeNode> const &tree, TensorPtr &packedMask, TensorPtr &paths) const
-
void copyPackedMask(TensorPtr &mask, SizeType32 srcIdx, SizeType32 dstIdx) const
-
void setOnePackedMask(TensorPtr &mask, SizeType32 row, SizeType32 col) const
-
Prefix computePrefix(std::vector<SizeType32> const &vec, SizeType32 len) const
-
void dumpChoices(MedusaChoices const &choices, std::vector<SizeType32> const &indices) const
Private Members
-
MedusaChoices mDefaultMedusaChoices = {{0}, {0, 0}, {1}, {0, 1}, {2}, {0, 0, 0}, {1, 0}, {0, 2}, {3}, {0, 3}, {4}, {0, 4}, {2, 0}, {0, 5}, {0, 0, 1}, {5}, {0, 6}, {6}, {0, 7}, {0, 1, 0}, {1, 1}, {7}, {0, 8}, {0, 0, 2}, {3, 0}, {0, 9}, {8}, {9}, {1, 0, 0}, {0, 2, 0}, {1, 2}, {0, 0, 3}, {4, 0}, {2, 1}, {0, 0, 4}, {0, 0, 5}, {0, 0, 0, 0}, {0, 1, 1}, {0, 0, 6}, {0, 3, 0}, {5, 0}, {1, 3}, {0, 0, 7}, {0, 0, 8}, {0, 0, 9}, {6, 0}, {0, 4, 0}, {1, 4}, {7, 0}, {0, 1, 2}, {2, 0, 0}, {3, 1}, {2, 2}, {8, 0}, {0, 5, 0}, {1, 5}, {1, 0, 1}, {0, 2, 1}, {9, 0}, {0, 6, 0}, {0, 0, 0, 1}, {1, 6}, {0, 7, 0}}
Private Static Attributes
-
static constexpr SizeType32 PREFIX_CHUNK_SIZE_BITS = 4
-
static constexpr SizeType32 PREFIX_MAX_VALUE = 16
-
struct MedusaTreeNode
Public Members
-
SizeType32 nodeId
-
SizeType32 depth
-
SizeType32 parentLinearIdx
-
SizeType32 linearIdx
-
std::vector<SizeType32> childLinearIndices
-
SizeType32 nodeId
-
using MedusaChoices = std::vector<std::vector<SizeType32>>
-
class MedusaModule : public tensorrt_llm::runtime::SpeculativeDecodingModule
-
namespace runtime
memoryCounters.h
-
namespace tensorrt_llm
-
namespace runtime
-
class MemoryCounters
-
Public Functions
-
MemoryCounters() = default
-
inline SizeType32 getGpu() const
-
inline SizeType32 getCpu() const
-
inline SizeType32 getPinned() const
-
inline SizeType32 getUVM() const
-
inline SizeType32 getPinnedPool() const
-
template<MemoryType T>
inline void allocate(SizeType32 size)
-
void allocate(MemoryType memoryType, SizeType32 size)
-
template<MemoryType T>
inline void deallocate(SizeType32 size)
-
void deallocate(MemoryType memoryType, SizeType32 size)
-
std::string toString() const
Public Static Functions
-
static MemoryCounters &getInstance()
-
static std::string bytesToString(SizeType32 bytes, int precision = 2)
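A minimal sketch of querying the process-wide counters; `getInstance` returns the singleton documented above.
```cpp
#include <memoryCounters.h>

using namespace tensorrt_llm::runtime;

void logGpuUsage()
{
    auto& counters = MemoryCounters::getInstance();
    auto gpuBytes = counters.getGpu();
    // Human-readable size string, e.g. something like "1.50 GB" at the default precision of 2.
    auto text = MemoryCounters::bytesToString(gpuBytes);
}
```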
Private Members
-
std::atomic<SizeType32> mGpu = {}
-
std::atomic<SizeType32> mCpu = {}
-
std::atomic<SizeType32> mPinned = {}
-
std::atomic<SizeType32> mUVM = {}
-
std::atomic<SizeType32> mPinnedPool = {}
-
MemoryCounters() = default
-
class MemoryCounters
-
namespace runtime
modelConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class ModelConfig
Public Types
-
enum class ModelVariant : std::int32_t
Values:
-
enumerator kGpt
-
enumerator kChatGlm
-
enumerator kGlm
-
enumerator kMamba
-
enumerator kRecurrentGemma
-
enumerator kEncDec
-
enumerator kGpt
-
enum class LayerType : std::int32_t
Values:
-
enumerator kATTENTION
-
enumerator kRECURRENT
-
enumerator kLINEAR
-
enumerator kNOOP
-
enumerator kATTENTION
Public Functions
-
inline explicit ModelConfig(SizeType32 vocabSize, SizeType32 nbLayers, SizeType32 nbAttentionLayers, SizeType32 nbRnnLayers, SizeType32 nbHeads, SizeType32 hiddenSize, nvinfer1::DataType dtype)
-
inline constexpr SizeType32 getVocabSize() const noexcept
-
inline constexpr SizeType32 getVocabSizePadded(SizeType32 worldSize) const noexcept
-
inline SizeType32 countLocalLayers(LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
-
inline SizeType32 countLowerRankLayers(LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
-
inline SizeType32 getNbLayers(SizeType32 pipelineParallelism = 1) const
-
inline SizeType32 getNbAttentionLayers(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
-
inline SizeType32 getNbRnnLayers(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
-
inline constexpr SizeType32 getNbHeads() const noexcept
-
inline SizeType32 getNbKvHeads(SizeType32 layerIdx) const
-
inline void setNbKvHeads(SizeType32 nbKvHeads)
-
inline constexpr SizeType32 getHiddenSize() const noexcept
-
inline constexpr SizeType32 getEncoderHiddenSize() const noexcept
-
inline constexpr void setEncoderHiddenSize(SizeType32 encoderHiddenSize) noexcept
-
inline constexpr SizeType32 getSizePerHead() const noexcept
-
inline constexpr void setSizePerHead(SizeType32 sizePerHead) noexcept
-
inline constexpr bool useGptAttentionPlugin() const noexcept
-
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
-
inline constexpr bool useMambaConv1dPlugin() const noexcept
-
inline constexpr void useMambaConv1dPlugin(bool useMambaConv1dPlugin) noexcept
-
inline constexpr bool usePackedInput() const noexcept
-
inline constexpr void usePackedInput(bool inputPacked) noexcept
-
inline constexpr bool usePagedState() const noexcept
-
inline constexpr void usePagedState(bool pagedState) noexcept
-
inline constexpr SizeType32 getTokensPerBlock() const noexcept
-
inline constexpr void setTokensPerBlock(SizeType32 TokensPerBlock) noexcept
-
inline constexpr common::QuantMode getQuantMode() const noexcept
-
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
-
inline constexpr bool supportsInflightBatching() const noexcept
-
inline constexpr SizeType32 getMaxBatchSize() const noexcept
-
inline constexpr void setMaxBatchSize(SizeType32 maxBatchSize) noexcept
-
inline constexpr SizeType32 getMaxBeamWidth() const noexcept
-
inline constexpr void setMaxBeamWidth(SizeType32 maxBeamWidth) noexcept
-
inline constexpr SizeType32 getMaxInputLen() const noexcept
-
inline constexpr void setMaxInputLen(SizeType32 maxInputLen) noexcept
-
inline constexpr SizeType32 getMaxSequenceLen() const noexcept
-
inline constexpr void setMaxSequenceLen(SizeType32 maxSequenceLen) noexcept
-
inline constexpr std::optional<SizeType32> getMaxNumTokens() const noexcept
-
inline constexpr void setMaxNumTokens(std::optional<SizeType32> maxNumTokens) noexcept
-
inline constexpr SizeType32 getMaxEncoderLen() const noexcept
-
inline constexpr void setMaxEncoderLen(SizeType32 maxEncoderLen) noexcept
-
inline constexpr bool usePromptTuning() const noexcept
-
inline constexpr SizeType32 getMaxPromptEmbeddingTableSize() const noexcept
-
inline constexpr void setMaxPromptEmbeddingTableSize(SizeType32 maxPromptEmbeddingTableSize) noexcept
-
inline constexpr bool computeContextLogits() const noexcept
-
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
-
inline constexpr bool computeGenerationLogits() const noexcept
-
inline constexpr void computeGenerationLogits(bool computeGenerationLogits) noexcept
-
inline ModelVariant getModelVariant() const
-
inline void setModelVariant(ModelVariant modelVariant)
-
inline SizeType32 getMaxDecodingDraftTokens() const
-
inline constexpr SizeType32 getMaxDecodingTokens() const noexcept
-
inline constexpr void setContextFMHA(bool contextFMHA) noexcept
-
inline constexpr bool getContextFMHA() const noexcept
-
inline constexpr void setPagedContextFMHA(bool pagedContextFMHA) noexcept
-
inline constexpr bool getPagedContextFMHA() const noexcept
-
inline constexpr void useXQA(bool useXQA) noexcept
-
inline constexpr bool useXQA() const noexcept
-
inline constexpr bool useLoraPlugin() const noexcept
-
inline constexpr void useLoraPlugin(bool useLoraPlugin) noexcept
-
inline std::vector<LoraModule> const &getLoraModules() const noexcept
-
inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept
-
inline constexpr SizeType32 getMlpHiddenSize() const noexcept
-
inline constexpr void setMlpHiddenSize(SizeType32 mlpHiddenSize) noexcept
-
inline constexpr bool isKVCacheEnabled() const noexcept
-
inline constexpr bool isPagedKVCache() const noexcept
-
inline constexpr bool isContinuousKVCache() const noexcept
-
inline constexpr KVCacheType getKVCacheType() const noexcept
-
inline constexpr void setKVCacheType(KVCacheType kvCacheType) noexcept
-
inline constexpr bool useCrossAttention() const noexcept
-
inline constexpr void setUseCrossAttention(bool useCrossAttention) noexcept
-
inline constexpr bool usePositionEmbedding() const noexcept
-
inline constexpr void setUsePositionEmbedding(bool usePositionEmbedding) noexcept
-
inline constexpr bool useTokenTypeEmbedding() const noexcept
-
inline constexpr void setUseTokenTypeEmbedding(bool useTokenTypeEmbedding) noexcept
-
inline constexpr SizeType32 getMaxLoraRank() const noexcept
-
inline constexpr void setMaxLoraRank(SizeType32 maxLoraRank) noexcept
-
inline void setSpeculativeDecodingMode(SpeculativeDecodingMode mode) noexcept
-
inline bool hasSpeculativeDecodingModule() const noexcept
-
inline SpeculativeDecodingModule const &getSpeculativeDecodingModule() const noexcept
-
inline std::shared_ptr<SpeculativeDecodingModule const> getSpeculativeDecodingModulePtr() const noexcept
-
inline std::shared_ptr<SpeculativeDecodingModule> getSpeculativeDecodingModulePtr() noexcept
-
inline constexpr bool isTransformerBased() const noexcept
-
inline bool hasRnnConfig() const noexcept
-
inline constexpr bool isRnnBased() const noexcept
-
inline constexpr SpeculativeDecodingMode getSpeculativeDecodingMode() const noexcept
-
inline void setUseShapeInference(bool useShapeInference) noexcept
-
inline bool useShapeInference() const noexcept
-
inline ManageWeightsType getManageWeightsType() const noexcept
-
inline void setManageWeightsType(const ManageWeightsType manageWeightType) noexcept
-
inline std::string const &getModelName() const noexcept
-
inline void setModelName(std::string const &modelName)
-
inline std::vector<SizeType32> const &getNumKvHeadsPerLayer() const
-
inline std::pair<std::vector<SizeType32>::const_iterator, std::vector<SizeType32>::const_iterator> getNumKvHeadsPerLayerLocalRange(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
-
inline void setNumKvHeadsPerLayer(std::vector<SizeType32> const &headsPerLayer)
-
inline SizeType32 getSumLocalKvHeads(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
Public Static Functions
-
static inline KVCacheType KVCacheTypeFromString(std::string value)
-
static inline std::vector<SizeType32> getOptProfilesSplitPoints() noexcept
Public Static Attributes
-
static constexpr std::array kOPT_PROFILES_SPLIT_POINTS = {64, 128, 256, 512, 1024}
Private Members
-
SizeType32 mVocabSize
-
SizeType32 mNbLayers
-
SizeType32 mNbAttentionLayers
-
SizeType32 mNbRnnLayers
-
SizeType32 mNbHeads
-
SizeType32 mHiddenSize
-
SizeType32 mSizePerHead
-
bool mUseGptAttentionPlugin
-
bool mUseMambaConv1dPlugin
-
bool mInputPacked
-
bool mPagedState
-
SizeType32 mTokensPerBlock
-
common::QuantMode mQuantMode
-
SizeType32 mMaxBatchSize
-
SizeType32 mMaxBeamWidth
-
SizeType32 mMaxInputLen
-
SizeType32 mMaxSequenceLen
-
std::optional<SizeType32> mMaxNumTokens
-
bool mComputeContextLogits
-
bool mComputeGenerationLogits
-
ModelVariant mModelVariant
-
SizeType32 mMaxPromptEmbeddingTableSize
-
bool mContextFMHA
-
bool mPagedContextFMHA
-
bool mUseXQA
-
bool mUseLoraPlugin
-
std::vector<LoraModule> mLoraModules
-
SizeType32 mMlpHiddenSize
-
SizeType32 mMaxLoraRank
-
KVCacheType mKVCacheType = KVCacheType::kCONTINUOUS
-
SizeType32 mMaxEncoderLen = {}
-
SizeType32 mEncoderHiddenSize = {}
-
bool mUseCrossAttention
-
bool mUsePositionEmbedding
-
bool mUseTokenTypeEmbedding
-
std::shared_ptr<SpeculativeDecodingModule> mSpeculativeDecodingModule
-
SpeculativeDecodingMode mSpeculativeDecodingMode
-
bool mUseShapeInference
-
ManageWeightsType mManageWeightsType
-
std::string mModelName
-
std::vector<SizeType32> mNumKvHeadsPerAttentionLayer
-
struct RnnConfig
Public Members
-
SizeType32 stateSize = 0
-
SizeType32 convKernel = 0
-
SizeType32 rnnHiddenSize = 0
-
SizeType32 rnnHeadSize = 0
-
SizeType32 rnnConvDimSize = 0
-
SizeType32 stateSize = 0
-
enum class ModelVariant : std::int32_t
-
class ModelConfig
-
namespace runtime
promptTuningParams.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor>
class GenericPromptTuningParams -
Public Functions
-
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>
Public Types
-
using SizeType32 = GenericPromptTuningParams::SizeType32
Public Functions
-
inline explicit PromptTuningParams(TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr)
-
void fillTasksTensor(TensorPtr tasksHost, const SizeType32 batchSize, const SizeType32 numContextRequests, std::vector<SizeType32> const &reqBeamWidths, std::vector<SizeType32> const &reqPromptLengths, BufferManager const &manager, bool packedInput)
-
using SizeType32 = GenericPromptTuningParams::SizeType32
-
template<typename TTensor>
-
namespace runtime
rawEngine.h
-
namespace tensorrt_llm
-
namespace runtime
-
class RawEngine
Public Types
Public Functions
-
inline explicit RawEngine(std::filesystem::path enginePath) noexcept
-
inline explicit RawEngine(void const *engineAddr, std::size_t engineSize) noexcept
-
inline std::filesystem::path getPath() const
-
inline std::optional<std::filesystem::path> getPathOpt() const
-
inline void setPath(std::filesystem::path enginePath)
-
inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt() const
-
inline void setManagedWeightsMap(std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap)
-
inline void const *getAddress() const
-
inline std::size_t getSize() const
-
inline explicit RawEngine(std::filesystem::path enginePath) noexcept
-
class RawEngine
-
namespace runtime
request.h
-
namespace tensorrt_llm
-
namespace runtime
-
namespace decoder_batch
-
class Request
Public Types
-
using TensorConstPtr = ITensor::SharedConstPtr
Public Functions
-
inline explicit Request(TensorConstPtr ids, SizeType32 inputLen, std::optional<SizeType32> maxNewTokens = std::nullopt, std::optional<SizeType32> endId = std::nullopt)
Public Members
-
TensorConstPtr ids
-
SizeType32 inputLen
-
std::optional<SizeType32> maxNewTokens
-
std::optional<SizeType32> endId
-
SizeType32 generatedTokensPerEngineStep
-
std::optional<executor::LookaheadDecodingConfig> lookaheadRuntimeConfig
-
using TensorConstPtr = ITensor::SharedConstPtr
-
class Request
-
namespace decoder_batch
-
namespace runtime
samplingConfig.h
Defines
-
SET_FROM_OPTIONAL(varName, VarName, VarType)
-
namespace tensorrt_llm
-
namespace runtime
-
class SamplingConfig
Public Functions
-
inline explicit SamplingConfig(SizeType32 beamWidth = 1)
-
inline explicit SamplingConfig(std::vector<SamplingConfig> const &configs)
-
inline explicit SamplingConfig(executor::SamplingConfig const &samplingConfig, std::optional<executor::ExternalDraftTokensConfig> const &externalDraftTokensConfig)
-
inline bool validate()
-
inline bool operator==(SamplingConfig const &other) const
Public Members
-
SizeType32 beamWidth
-
OptVec<SizeType32> minLength
-
OptVec<SizeType32> noRepeatNgramSize
-
OptVec<SizeType32> topK
-
OptVec<TokenIdType> topPResetIds
-
OptVec<SizeType32> earlyStopping
-
OptVec<std::vector<runtime::SizeType32>> topKMedusaHeads
-
std::optional<bool> normalizeLogProbs
Private Types
-
using FloatType = float
Private Functions
-
inline explicit SamplingConfig(SizeType32 beamWidth = 1)
-
class SamplingConfig
-
namespace runtime
speculativeDecodingMode.h
-
namespace tensorrt_llm
-
namespace runtime
-
class SpeculativeDecodingMode
Public Types
-
using UnderlyingType = std::uint8_t
Public Functions
-
inline constexpr bool isNone() const
-
inline constexpr bool isDraftTokensExternal() const
-
inline constexpr bool isMedusa() const
-
inline constexpr bool isLookaheadDecoding() const
-
inline constexpr bool isExplicitDraftTokens() const
-
inline constexpr bool updatesPositionIds() const
-
inline constexpr bool requiresAttentionMask() const
-
inline constexpr bool predictsDraftTokens() const
-
inline constexpr bool needsKVCacheRewind() const
-
inline constexpr bool variableDraftLength() const
-
inline constexpr bool hasDraftLogits() const
-
inline constexpr bool needsDecoderPrologue() const
-
inline bool operator==(SpeculativeDecodingMode const &other) const
-
inline explicit constexpr SpeculativeDecodingMode(UnderlyingType state)
Public Static Functions
-
static inline constexpr auto None()
-
static inline constexpr auto DraftTokensExternal()
-
static inline constexpr auto Medusa()
-
static inline constexpr auto LookaheadDecoding()
-
static inline constexpr auto ExplicitDraftTokens()
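A minimal sketch of constructing a mode through the static factories and querying its properties:
```cpp
#include <speculativeDecodingMode.h>

using namespace tensorrt_llm::runtime;

void inspectMode()
{
    auto mode = SpeculativeDecodingMode::Medusa();
    bool isMedusa = mode.isMedusa();               // true for this mode
    bool needsMask = mode.requiresAttentionMask(); // depends on the mode's flags
    bool same = (mode == SpeculativeDecodingMode::Medusa());
}
```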
Private Functions
-
inline constexpr bool anyBitSet(UnderlyingType bits) const
-
inline constexpr bool allBitSet(UnderlyingType bits) const
Private Members
-
UnderlyingType mState = {kNone}
Private Static Attributes
-
static constexpr UnderlyingType kNone = {1U << 0U}
-
static constexpr UnderlyingType kDraftTokensExternal = {1U << 1U}
-
static constexpr UnderlyingType kMedusa = {1U << 2U}
-
static constexpr UnderlyingType kLookaheadDecoding = {1U << 3U}
-
static constexpr UnderlyingType kExplicitDraftTokens = {1U << 4U}
-
using UnderlyingType = std::uint8_t
-
class SpeculativeDecodingMode
-
namespace runtime
speculativeDecodingModule.h
-
namespace tensorrt_llm
-
namespace runtime
-
class SpeculativeDecodingModule
Subclassed by tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule
Public Functions
-
inline explicit SpeculativeDecodingModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens, SizeType32 maxNumPaths) noexcept
-
inline explicit SpeculativeDecodingModule() noexcept
-
virtual ~SpeculativeDecodingModule() = default
-
SpeculativeDecodingModule(SpeculativeDecodingModule const &o) = default
-
SpeculativeDecodingModule &operator=(SpeculativeDecodingModule const &o) = default
-
inline SizeType32 getMaxDraftPathLen() const noexcept
- Returns:
max number of draft tokens that can be accepted by one step of the decoder
-
inline SizeType32 getMaxPathLen() const noexcept
one more than draft path len for prediction from primary head
- Returns:
max number of tokens that a request can grow in one step of the decoder
-
inline SizeType32 getMaxDecodingDraftTokens() const noexcept
- Returns:
max number of draft tokens processed by one step of the decoder
-
inline SizeType32 getMaxDecodingTokens() const noexcept
one more than decoding draft tokens for prediction from primary head
- Returns:
max number of tokens processed by one step of the decoder
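The two "one more than" relationships above can be summarized in a small sketch; the constructor arguments are illustrative.
```cpp
#include <speculativeDecodingModule.h>
#include <cassert>

using namespace tensorrt_llm::runtime;

void checkRelations()
{
    SpeculativeDecodingModule module(/*maxDraftPathLen=*/4, /*maxDecodingDraftTokens=*/63, /*maxNumPaths=*/64);
    assert(module.getMaxPathLen() == module.getMaxDraftPathLen() + 1);
    assert(module.getMaxDecodingTokens() == module.getMaxDecodingDraftTokens() + 1);
}
```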
-
inline SizeType32 getNumPackedMasks() const noexcept
-
inline SizeType32 getMaxNumPaths() const noexcept
-
inline void setMaxDraftTokens(SizeType32 maxDraftTokens) noexcept
-
inline void setMaxDraftPathLen(SizeType32 maxDraftPathLen) noexcept
-
inline void setMaxNumPaths(SizeType32 maxNumPaths) noexcept
Private Functions
-
inline void computeNumPackedMasks() noexcept
Private Members
-
SizeType32 mMaxDraftPathLen
-
SizeType32 mMaxDecodingDraftTokens
-
SizeType32 mMaxNumPaths
-
SizeType32 mMaxNumPackedMasks
-
inline explicit SpeculativeDecodingModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens, SizeType32 maxNumPaths) noexcept
-
class SpeculativeDecodingModule
-
namespace runtime
tllmLogger.h
worldConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class WorldConfig
Public Functions
-
explicit WorldConfig(SizeType32 tensorParallelism = 1, SizeType32 pipelineParallelism = 1, SizeType32 rank = 0, SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)
-
inline constexpr SizeType32 getSize() const noexcept
-
inline constexpr SizeType32 getTensorParallelism() const noexcept
-
inline constexpr bool isTensorParallel() const noexcept
-
inline constexpr SizeType32 getPipelineParallelism() const noexcept
-
inline constexpr bool isPipelineParallel() const noexcept
-
inline constexpr SizeType32 getRank() const noexcept
-
inline constexpr SizeType32 getGpusPerNode() const noexcept
-
inline SizeType32 getGpusPerGroup() const noexcept
-
inline SizeType32 getDevice() const noexcept
-
inline SizeType32 getDeviceOf(SizeType32 rank) const noexcept
-
inline constexpr SizeType32 getPipelineParallelRank() const noexcept
-
inline constexpr SizeType32 getTensorParallelRank() const noexcept
-
inline constexpr SizeType32 getLocalRank() const noexcept
-
inline constexpr SizeType32 getNodeRank() const noexcept
-
inline constexpr SizeType32 getNodeRankOf(SizeType32 rank) const noexcept
-
inline constexpr bool isFirstPipelineParallelRank() const noexcept
-
inline constexpr bool isLastPipelineParallelRank() const noexcept
Is my rank the last rank in its pipeline?
-
inline constexpr bool isFirstTensorParallelRank() const noexcept
-
inline constexpr SizeType32 getLastRank() const noexcept
-
std::vector<SizeType32> getPipelineParallelGroup() const
-
std::vector<SizeType32> getTensorParallelGroup() const
-
bool validMpiConfig() const
Public Static Functions
-
static WorldConfig mpi(SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType32> tensorParallelism = std::nullopt, std::optional<SizeType32> pipelineParallelism = std::nullopt, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)
Public Static Attributes
-
static constexpr SizeType32 kDefaultGpusPerNode = 1
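For illustration, a sketch describing a 2-way tensor-parallel, 2-way pipeline-parallel world from the perspective of rank 3; the numbers are placeholders.
```cpp
#include <worldConfig.h>

using namespace tensorrt_llm::runtime;

void describeRank()
{
    WorldConfig world(/*tensorParallelism=*/2, /*pipelineParallelism=*/2, /*rank=*/3, /*gpusPerNode=*/8);
    auto tpRank = world.getTensorParallelRank();
    auto ppRank = world.getPipelineParallelRank();
    auto device = world.getDevice(); // local CUDA device for this rank
}
```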
Private Members
-
SizeType32 mTensorParallelism
-
SizeType32 mPipelineParallelism
-
SizeType32 mRank
-
SizeType32 mGpusPerNode
-
std::vector<SizeType32> mDeviceIds
-
explicit WorldConfig(SizeType32 tensorParallelism = 1, SizeType32 pipelineParallelism = 1, SizeType32 rank = 0, SizeType32 gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt)
-
class WorldConfig
-
namespace runtime