Runtime#
iTensor.h#
- 
namespace nvinfer1#
- 
namespace tensorrt_llm
- 
namespace runtime
- Functions - inline std::ostream &operator<<( )#
- Utility function to print a shape. 
 
 - std::ostream &operator<<(
- std::ostream &output,
- ITensor const &tensor,
- Utility function to print a tensor with its shape. 
 
 - ITensor::SharedConstPtr const &tensorPtr,
- Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null. - This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- tensorPtr – A possibly null shared ptr. 
- Returns:
- A pointer to T const, possibly nullptr. 
 
 
 - 
)#
- Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null. - This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- tensorPtr – A possibly null shared ptr. 
- Returns:
- A pointer to T, possibly nullptr. 
 
 
 - 
)#
- Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalTensorPtr, or nullptr if the optional doesn’t have a value. - This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- optionalTensorPtr – A possibly empty optional. 
- Returns:
- A pointer to T, possibly nullptr. 
 
 
 - std::optional<ITensor::SharedConstPtr> const &optionalTensorPtr,
- Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalTensorPtr, or nullptr if the optional doesn’t have a value. - This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- optionalTensorPtr – A possibly empty optional. 
- Returns:
- A pointer to const T, possibly nullptr. 
 
 
 - 
class ITensor : public virtual tensorrt_llm::runtime::IBuffer#
- 
Public Functions - 
~ITensor() override = default#
 - 
template<SizeType32 n>
 inline DimType64 getDimension() const#
- Returns the n-th dimension of the tensor. If n is negative, returns the (nbDims + n)-th dimension, i.e. it counts from the last dimension. TODO: replace with constexpr parameter when moving to C++20. 
 - 
virtual void reshape(Shape const &dims) = 0#
- Sets the tensor dimensions. The new size of the tensor will be `volume(dims)`.
 - 
inline virtual void resize(std::size_t newSize) override#
- Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity. 
 - 
inline void squeeze(SizeType32 dim)#
- Removes the given unit dimension from this tensor. 
 - 
inline void unsqueeze(SizeType32 dim)#
- Adds a unit dimension at the specified position. 
 - inline bool shapeEquals(
- std::initializer_list<SizeType32> const &other,
 
 - 
template<typename T>
 inline bool shapeEquals(
- T const *dims,
- SizeType32 count,
 
 Public Static Functions - 
static inline std::int64_t volume(Shape const &dims)#
- Returns the volume of the dimensions. Returns -1 if `dims.nbDims < 0`.
 - 
static inline std::size_t volumeNonNegative(Shape const &shape)#
- Returns the volume of the dimensions. Throws if `shape.nbDims < 0`.
 - 
static Shape squeeze(Shape const &shape, SizeType32 dim)#
- Removes the given unit dimension from `shape`. - Parameters:
- shape – The shape to squeeze. 
- dim – The dimension that should be removed (“squeezed”). 
 
- Returns:
- A new shape without the unit dimension. 
 
 - 
static Shape unsqueeze(Shape const &shape, SizeType32 dim)#
- Adds a unit dimension to `shape` at the specified position. - Parameters:
- shape – The shape to unsqueeze. 
- dim – The dimension where unit dimension should be added. 
 
- Returns:
- A new shape with the added unit dimension. 
 
 - SharedPtr tensor,
- std::size_t offset,
- std::size_t size,
- Creates a sliced view on the underlying `tensor`. The view will have the same data type as `tensor`. - Parameters:
- tensor – The tensor to view. 
- offset – The offset of the view w.r.t. dimension 0 of the tensor. 
- size – The size of the view w.r.t. dimension 0 of the tensor. 
 
- Returns:
- A view on the `tensor`.
 
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr slice(
- TConstPtr &&tensor,
- std::size_t offset,
- std::size_t size,
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr slice(
- TConstPtr &&tensor,
- std::size_t offset,
 
 - 
)#
- Parameters:
- tensor – The tensor to view. 
- offsetDims – The offset of the view in multiple dimensions. 
- size – The size of the view w.r.t. the last dimension in offsetDims. 
 
- Throws:
- Whenever the offset overflows or the last-dimension offset + size overflows. 
- Returns:
- A view of shape [size, the rest dimensions], or [size] when offsetDims specifies all dimensions. 
 
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr slice( 
)#
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr slice( 
)#
 
 - 
)#
- Returns the remaining slices along the last dimension when `size` is omitted.
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr slice( 
)#
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr slice( 
)#
 
 - Parameters:
- offsetDims – The offset in multiple dimensions. 
- Returns:
- Just the block at the point, with shape [the rest dimensions], or [1] when offsetDims specifies all dimensions. 
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr at( 
)#
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline ITensor::UniqueConstPtr at( 
)#
 
 - Returns a view on the underlying `buffer` (or tensor) with the given shape. - Parameters:
- tensor – The tensor to view. 
- shape – The shape of the view. 
 
- Returns:
- A view on the `tensor`.
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr view( 
)#
 
 - Returns a view on the underlying `tensor` which can be independently reshaped. - Parameters:
- tensor – The tensor to view. 
- Returns:
- A view on the `tensor`.
 
 - SharedPtr tensor,
- std::int64_t sliceN = -1,
- Returns a flattened view on the underlying `tensor` which can be independently reshaped. - Parameters:
- tensor – The tensor to flatten. 
- sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor. 
 
- Returns:
- A flattened view on the `tensor`.
 
 
 - static UniquePtr wrap( )#
- Wraps the given `data` in an `ITensor`. The `ITensor` will not own the underlying `data` and cannot be reshaped beyond `capacity`. - Parameters:
- data – The data to wrap. 
- type – The data type of the `data`.
- shape – The shape of the tensor. 
- capacity – The capacity of the buffer. 
 
- Returns:
- An `ITensor`.
 
 
 - static Shape makeShape(
- std::initializer_list<DimType64> const &dims,
- A convenience function to create a tensor shape with the given dimensions. 
 
 - 
static std::string toString(Shape const &dims)#
- A convenience function for converting a tensor shape to a `string`.
 - 
static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)#
- A convenience function to compare shapes. 
 - 
template<typename T>
 static inline bool shapeEquals(
- Shape const &lhs,
- T const *dims,
- SizeType32 count,
- A convenience function to compare shapes. 
 
 Protected Functions - 
ITensor() = default#
 Friends - friend class ITensorBindings
 
- 
~ITensor() override = default#
 
 
- 
namespace runtime
cudaEvent.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class CudaEvent#
- Public Types - 
using pointer = cudaEvent_t#
 - Public Functions - 
inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)#
- Creates a new cuda event. The event will be destroyed in the destructor. - Parameters:
- flags – Flags for event creation. By default, event timing is disabled. 
 
 - 
inline explicit CudaEvent(pointer event, bool ownsEvent = true)#
- Pass an existing cuda event to this object. - Parameters:
- event – The event to pass to this object. 
- ownsEvent – Whether this object owns the event and destroys it in the destructor. 
 
 
 - 
inline void synchronize() const#
- Synchronizes the event. 
 - Private Types - 
using EventPtr = std::unique_ptr<element_type, Deleter>#
 
- 
using pointer = cudaEvent_t#
 
- 
class CudaEvent#
 
- 
namespace runtime
virtualMemory.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- Functions - 
CudaVirtualMemoryManager &getVirtualMemoryManager()#
 - 
CudaVirtualMemoryAllocator getVirtualMemoryAllocator()#
 - std::string const &tag,
- CudaVirtualMemoryAllocator::RestoreMode mode,
- std::shared_ptr<CudaStream> backStream,
 
 - 
void clearVirtualMemoryAllocator()#
 - 
class CudaVirtualMemoryAllocator#
- Public Types - Public Functions - std::shared_ptr<Configuration> config,
 
 - 
inline explicit operator bool() const noexcept#
 - Private Members - 
std::shared_ptr<Configuration> mConfig#
 - 
class Configuration#
- Public Functions - inline Configuration(
- CudaVirtualMemoryManager &manager,
- std::string tag,
- RestoreMode mode,
- CudaStreamPtr backStream,
- CudaVirtualMemoryAllocator::Configuration - Parameters:
- manager – Manager used to track and manage virtual memories 
- tag – The tag for allocated memories 
- mode – Backed storage mode 
- backStream – The CUDA stream used for restoring memory content. Note: virtual address allocation is not asynchronous; the stream is not used for allocation. 
 
 
 
 - 
inline std::size_t pageAligned(std::size_t n) const noexcept#
 - Public Static Attributes - 
static Configuration backgroundConfiguration#
 - Private Functions - inline Configuration(
- CudaVirtualMemoryManager &manager,
- std::string tag,
- RestoreMode mode,
- CudaStreamPtr backStream,
- bool background,
 
 - Private Members - 
CudaVirtualMemoryManager &mManager#
 - 
std::string mTag#
 - 
CudaStreamPtr mBackStream#
 - 
std::size_t mPageSize#
 - 
RestoreMode mMode#
 - 
bool mBackground = {}#
 - Friends - friend class CudaVirtualMemoryAllocator
 - std::string const &tag,
- RestoreMode mode,
- std::shared_ptr<CudaStream> backStream,
 
 
 
 - 
class CUDAVirtualMemoryChunk#
- #include <virtualMemory.h> CUDAVirtualMemoryChunk is a handle to a piece of CUDA memory allocation, providing the ability to release and rematerialize the allocation. Public Types - 
enum Status#
- Values: - 
enumerator INVALID#
 - 
enumerator RELEASED#
 - 
enumerator MATERIALIZED#
 - 
enumerator ERRORED#
 
- 
enumerator INVALID#
 - 
using ConfiguratorPtr = std::unique_ptr<Configurator>#
 - 
using Configurators = std::vector<ConfiguratorPtr>#
 Public Functions - 
void materialize()#
- Materializes this CUDAVirtualMemoryChunk. Shall be called only when status() == RELEASED. - Calls creator.create(), and then configurator.setup() for each configurator in order. - Stops at the first thrown exception and propagates it. 
 - 
inline void release()#
- Releases this CUDAVirtualMemoryChunk. Shall be called only when status() == MATERIALIZED, or after materialize() has thrown. Will be called automatically by the destructor if necessary. - Calls configurator.teardown(), in reverse order, for each configurator whose setup() succeeded in materialize(), and then creator.release(). - Never stops early upon an exception. The last thrown exception will be propagated, and the others logged. 
 - 
CUDAVirtualMemoryChunk(CUDAVirtualMemoryChunk const&) = delete#
 - CUDAVirtualMemoryChunk &operator=(
- CUDAVirtualMemoryChunk const&,
 
 - inline CUDAVirtualMemoryChunk(
- CUDAVirtualMemoryChunk &&other,
 
 - inline CUDAVirtualMemoryChunk &operator=(
- CUDAVirtualMemoryChunk &&other,
 
 - 
CUDAVirtualMemoryChunk() noexcept = default#
 - inline CUDAVirtualMemoryChunk(
- CreatorPtr &&creator,
- Configurators &&configurators,
 
 - 
inline virtual ~CUDAVirtualMemoryChunk()#
 - 
inline explicit operator bool() const noexcept#
- Test if this CUDAVirtualMemoryChunk is managing a memory block. 
 Private Functions - 
void _release(bool destructing)#
 Private Members - 
size_t mState = 0#
 - 
CUmemGenericAllocationHandle mHandle = {}#
 - 
std::vector<std::unique_ptr<Configurator>> mConfigurators#
 Private Static Attributes - 
static constexpr size_t INVALID_STATE = static_cast<size_t>(-1)#
 - 
struct Configurator#
- #include <virtualMemory.h> CUDAVirtualMemoryChunk::Configurator is the interface to configure a CUmemGenericAllocationHandle: - Map into virtual address 
- Bind to multicast object 
- Backup and restore memory content 
 Subclassed by tensorrt_llm::runtime::MemsetConfigurator, tensorrt_llm::runtime::MulticastConfigurator, tensorrt_llm::runtime::OffloadConfigurator, tensorrt_llm::runtime::UnicastConfigurator Public Functions - 
Configurator() = default#
 - 
virtual ~Configurator() = default#
 - 
Configurator(Configurator const&) = default#
 - 
Configurator &operator=(Configurator const&) = default#
 - 
Configurator(Configurator&&) = default#
 - 
Configurator &operator=(Configurator&&) = default#
 - 
virtual void setup(CUmemGenericAllocationHandle handle) = 0#
 - virtual void teardown(
- CUmemGenericAllocationHandle handle,
- bool destructing,
 
 
 - 
struct Creator#
- #include <virtualMemory.h> CUDAVirtualMemoryChunk::Creator is the interface to obtain a CUmemGenericAllocationHandle, either by creating one locally, or importing one from remote. Subclassed by tensorrt_llm::runtime::LocalCreator< count > 
 
- 
enum Status#
 - 
class CudaVirtualMemoryManager#
- Public Functions - void add(
- uintptr_t handle,
- std::string tag,
- CUDAVirtualMemoryChunk &&memory,
- Add memory to be managed by this manager. - The memory and internal state will remain valid if any exception is thrown. - Parameters:
- handle – Unique handle provided to reference this memory in `remove`.
- tag – Tag for the memory, so that this memory can be targeted by `releaseWithTag` and `materializeWithTag`.
- memory – The CUDAVirtualMemoryChunk object. 
 
 
 
 - void add(
- uintptr_t handle,
- std::string tag,
- CUDAVirtualMemoryChunk::CreatorPtr &&creator,
- CUDAVirtualMemoryChunk::Configurators &&configurators,
- Creates and adds memory to be managed by this manager. The created memory is automatically materialized. - The internal state will remain valid if any exception is thrown. - Parameters:
- handle – Unique handle provided to reference this memory in `remove`.
- tag – Tag for the memory, so that this memory can be targeted by `releaseWithTag` and `materializeWithTag`.
- creator – The creator for the memory. 
- configurators – The configurators for the memory. 
 
 
 
 - 
template<typename ...Configurators>
 inline void add(
- uintptr_t handle,
- std::string tag,
- CUDAVirtualMemoryChunk::CreatorPtr &&creator,
- Configurators&&... configurators,
 
 - 
CUDAVirtualMemoryChunk remove(uintptr_t handle) noexcept#
- Remove the memory from the manager. - Parameters:
- handle – The handle provided to `add`.
- Returns:
- The CUDAVirtualMemoryChunk object. If the handle is unknown, an empty CUDAVirtualMemoryChunk will be returned. 
 
 - 
size_t releaseWithTag(std::string const &tag)#
- Calls release on CUDAVirtualMemoryChunk objects with a given tag. - This function will always call `CUDAVirtualMemoryChunk::release` on all selected objects. The last exception thrown by `CUDAVirtualMemoryChunk::release` will be rethrown, and others will be logged. - If any CUDAVirtualMemoryChunk threw an exception during `release`, it will be removed from the manager. Call `retrieveBadHandles` to retrieve the handles of all CUDAVirtualMemoryChunk objects that were removed due to an exception. - Parameters:
- tag – the tag to select target memories. 
- Returns:
- Number of objects selected. 
 
 - 
size_t materializeWithTag(std::string const &tag)#
- Calls materialize on CUDAVirtualMemoryChunk objects with a given tag. - This function will stop at the first `CUDAVirtualMemoryChunk::materialize` that throws an exception, and attempt to roll back the previous successful `materialize` calls by calling `release`. The exception thrown by `CUDAVirtualMemoryChunk::materialize` will be rethrown, and any exception thrown by `release` will be logged. - If any CUDAVirtualMemoryChunk threw an exception during `materialize` or `release`, it will be removed from the manager. Successfully rolled-back CUDAVirtualMemoryChunk objects will not be removed. Call `retrieveBadHandles` to retrieve the handles of all CUDAVirtualMemoryChunk objects that were removed due to an exception. - Parameters:
- tag – the tag to select target memories. 
- Returns:
- Number of objects selected. 
 
 - 
std::vector<uintptr_t> retrieveBadHandles() noexcept#
- Retrieves the handles of all CUDAVirtualMemoryChunk objects that were removed due to an exception, and resets the list. The returned list may not include all removed CUDAVirtualMemoryChunk handles if an out-of-memory (OOM) condition occurred. This method is only for diagnostic purposes, and should not be called concurrently with other methods. - Returns:
- The handle list. 
 
 - Private Types - 
using TagEntryMap = std::multimap<std::string, PointerMemoryMap::iterator>#
 - Private Functions - 
CUDAVirtualMemoryChunk unsafeRemove(uintptr_t handle) noexcept#
 - 
void addBadHandle(uintptr_t handle) noexcept#
 - Private Members - 
std::mutex mMutex#
 - 
PointerMemoryMap mMemories#
 - 
TagEntryMap mEntries#
 - 
std::vector<uintptr_t> mBadHandles#
 - friend VirtualMemoryManagerTest
 - 
struct Entry#
 
 - 
template<bool count = true>
 struct LocalCreator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Creator#
- #include <virtualMemory.h> LocalCreator creates memory allocation locally through cuMemCreate. 
 - 
struct MemsetConfigurator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Configurator#
- #include <virtualMemory.h> MemsetConfigurator fills the memory with the given value. 
 - 
struct MulticastConfigurator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Configurator#
- #include <virtualMemory.h> MulticastConfigurator binds the allocation handle to the given multicast object and offset. 
 - 
struct OffloadConfigurator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Configurator#
- #include <virtualMemory.h> OffloadConfigurator offloads the content of the allocation to the backup storage on teardown, and restores the content on the following setup. Public Functions - inline OffloadConfigurator(
- CUdeviceptr address,
- size_t size,
- MemoryType backType,
- CUstream stream,
- bool ondemand = false,
 
 - 
virtual void setup(CUmemGenericAllocationHandle handle) override#
 - virtual void teardown(
- CUmemGenericAllocationHandle handle,
- bool destructing,
 
 
 - 
struct UnicastConfigurator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Configurator#
- #include <virtualMemory.h> UnicastConfigurator maps the allocation handle into the specified unicast address range. 
 
- 
CudaVirtualMemoryManager &getVirtualMemoryManager()#
 
- 
namespace runtime
speculativeDecodingModule.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class SpeculativeDecodingModule#
- Subclassed by tensorrt_llm::runtime::EagleModule, tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule - Public Functions - inline explicit SpeculativeDecodingModule(
- SizeType32 maxDraftPathLen,
- SizeType32 maxDecodingDraftTokens,
- SizeType32 maxNumPaths,
 
 - 
inline explicit SpeculativeDecodingModule() noexcept#
 - 
virtual ~SpeculativeDecodingModule() = default#
 - 
SpeculativeDecodingModule(SpeculativeDecodingModule const &o) = default#
 - SpeculativeDecodingModule &operator=(
- SpeculativeDecodingModule const &o,
 
 - 
inline SizeType32 getMaxDraftPathLen() const noexcept#
- Returns:
- max number of draft tokens that can be accepted by one step of the decoder 
 
 - 
inline SizeType32 getMaxPathLen() const noexcept#
- one more than draft path len for prediction from primary head - Returns:
- max number of tokens that a request can grow in one step of the decoder 
 
 - 
inline SizeType32 getMaxDecodingDraftTokens() const noexcept#
- Returns:
- max number of draft tokens processed by one step of the decoder 
 
 - 
inline SizeType32 getMaxDecodingTokens() const noexcept#
- one more than decoding draft tokens for prediction from primary head - Returns:
- max number of tokens processed by one step of the decoder 
 
 - 
inline SizeType32 getNumPackedMasks() const noexcept#
 - 
inline SizeType32 getMaxNumPaths() const noexcept#
 - 
inline void setMaxDraftTokens(SizeType32 maxDraftTokens) noexcept#
 - 
inline void setMaxDraftPathLen(SizeType32 maxDraftPathLen) noexcept#
 - 
inline void setMaxNumPaths(SizeType32 maxNumPaths) noexcept#
 - Private Functions - 
inline void computeNumPackedMasks() noexcept#
 - Private Members - 
SizeType32 mMaxDraftPathLen#
 - 
SizeType32 mMaxDecodingDraftTokens#
 - 
SizeType32 mMaxNumPaths#
 - 
SizeType32 mMaxNumPackedMasks#
 
 
- 
class SpeculativeDecodingModule#
 
- 
namespace runtime
common.h#
Defines
- 
FMT_DIM#
- 
namespace tensorrt_llm
- 
namespace runtime
- Typedefs - 
using SizeType32 = std::int32_t#
 - 
using SizeType64 = std::int64_t#
 - 
using TokenIdType = std::int32_t#
 - 
using LoraTaskIdType = std::uint64_t#
 - 
using TokenExtraIdType = std::uint64_t#
 - 
using VecTokenExtraIds = std::vector<TokenExtraIdType>#
 - 
using CacheSaltIDType = std::uint64_t#
 - 
using VecUniqueTokens = std::vector<UniqueToken>#
 - 
struct UniqueToken#
- Public Functions - 
inline bool operator==(UniqueToken const &other) const noexcept#
 
- 
inline bool operator==(UniqueToken const &other) const noexcept#
 
- 
using SizeType32 = std::int32_t#
 
- 
namespace runtime
samplingConfig.h#
Defines
- 
SET_FROM_OPTIONAL(varName, VarName, VarType)#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class SamplingConfig#
- Public Functions - 
inline explicit SamplingConfig(SizeType32 beamWidth = 1)#
 - inline explicit SamplingConfig(
- std::vector<SamplingConfig> const &configs,
 
 - inline explicit SamplingConfig(
- executor::SamplingConfig const &samplingConfig,
- std::optional<executor::ExternalDraftTokensConfig> const &externalDraftTokensConfig = std::nullopt,
 
 - 
inline bool validate()#
 - 
inline bool operator==(SamplingConfig const &other) const#
 - 
inline SizeType32 getNumReturnBeams() const#
 - 
inline SizeType32 getMaxBeamWidth() const noexcept#
 - Public Members - 
SizeType32 beamWidth#
 - 
std::optional<SizeType32> numReturnSequences#
 - 
OptVec<SizeType32> minLength#
 - 
OptVec<SizeType32> noRepeatNgramSize#
 - 
OptVec<SizeType32> topK#
 - 
OptVec<TokenIdType> topPResetIds#
 - 
OptVec<SizeType32> earlyStopping#
 - 
OptVec<std::vector<SizeType32>> beamWidthArray#
 - 
OptVec<std::vector<SizeType32>> topKMedusaHeads#
 - 
std::optional<bool> normalizeLogProbs#
 - Private Types - 
using FloatType = float#
 - Private Functions 
- 
inline explicit SamplingConfig(SizeType32 beamWidth = 1)#
 
- 
class SamplingConfig#
 
- 
namespace runtime
tllmLogger.h#
lookaheadModule.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule#
- Public Functions - inline explicit LookaheadModule(
- SizeType32 maxDraftPathLen,
- SizeType32 maxDecodingDraftTokens,
 
 - 
inline explicit LookaheadModule() noexcept#
 - inline void setExecutionConfig(
- executor::LookaheadDecodingConfig const &config,
 
 - inline executor::LookaheadDecodingConfig const &getExecutionConfig(
 
 - Private Members - 
executor::LookaheadDecodingConfig mExecutionConfig#
 
 
- 
class LookaheadModule : public tensorrt_llm::runtime::SpeculativeDecodingModule#
 
- 
namespace runtime
modelConfig.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class ModelConfig#
- Public Types - 
enum class ModelVariant : std::int32_t#
- Values: - 
enumerator kGpt#
 - 
enumerator kChatGlm#
 - 
enumerator kGlm#
 - 
enumerator kMamba#
 - 
enumerator kRecurrentGemma#
 - 
enumerator kEncDec#
 
- 
enumerator kGpt#
 - 
enum class LayerType : std::int32_t#
- Values: - 
enumerator kATTENTION#
 - 
enumerator kRECURRENT#
 - 
enumerator kLINEAR#
 - 
enumerator kNOOP#
 
- 
enumerator kATTENTION#
 - Public Functions - inline explicit ModelConfig(
- SizeType32 vocabSize,
- SizeType32 nbLayers,
- SizeType32 nbAttentionLayers,
- SizeType32 nbRnnLayers,
- SizeType32 nbHeads,
- SizeType32 hiddenSize,
- nvinfer1::DataType dtype,
 
 - 
inline SizeType32 constexpr getVocabSize() const noexcept#
 - inline SizeType32 constexpr getVocabSizePadded(
- SizeType32 worldSize,
 
 - inline SizeType32 countLocalLayers(
- LayerType layerType,
- SizeType32 pipelineParallelism = 1,
- SizeType32 pipelineParallelismRank = 0,
 
 - inline SizeType32 getFirstLocalLayer(
- SizeType32 pipelineParallelism = 1,
- SizeType32 pipelineParallelismRank = 0,
 
 - inline SizeType32 countLowerRankLayers(
- LayerType layerType,
- SizeType32 pipelineParallelism = 1,
- SizeType32 pipelineParallelismRank = 0,
 
 - inline SizeType32 getNbLayers(
- SizeType32 pipelineParallelism = 1,
- SizeType32 pipelineParallelismRank = 0,
 
 - inline SizeType32 getNbAttentionLayers(
- SizeType32 pipelineParallelism = 1,
- SizeType32 pipelineParallelismRank = 0,
 
 - inline SizeType32 getNbRnnLayers(
- SizeType32 pipelineParallelism = 1,
- SizeType32 pipelineParallelismRank = 0,
 
 - 
inline SizeType32 constexpr getNbHeads() const noexcept#
 - 
inline SizeType32 getNbKvHeads(SizeType32 layerIdx) const#
 - 
inline void setNbKvHeads(SizeType32 nbKvHeads)#
 - 
inline void setNbCrossKvHeads(SizeType32 nbKvHeads)#
 - 
inline SizeType32 constexpr getHiddenSize() const noexcept#
 - 
inline SizeType32 constexpr getEncoderHiddenSize() const noexcept#
 - inline void constexpr setEncoderHiddenSize(
- SizeType32 encoderHiddenSize,
 
 - 
inline SizeType32 constexpr getSizePerHead() const noexcept#
 - 
inline void constexpr setSizePerHead(SizeType32 sizePerHead) noexcept#
 - 
inline bool constexpr useGptAttentionPlugin() const noexcept#
 - 
inline bool constexpr useGemmAllReducePlugin() const noexcept#
 - inline void constexpr useGptAttentionPlugin(
- bool useGptAttentionPlugin,
 
 - inline void constexpr useGemmAllReducePlugin(
- bool useGemmAllReducePlugin,
 
 - 
inline bool constexpr useMambaConv1dPlugin() const noexcept#
 - inline void constexpr useMambaConv1dPlugin(
- bool useMambaConv1dPlugin,
 
 - 
inline bool constexpr usePackedInput() const noexcept#
 - 
inline void constexpr usePackedInput(bool inputPacked) noexcept#
 - 
inline bool constexpr usePagedState() const noexcept#
 - 
inline void constexpr usePagedState(bool pagedState) noexcept#
 - 
inline SizeType32 constexpr getTokensPerBlock() const noexcept#
 - inline void constexpr setTokensPerBlock(
- SizeType32 TokensPerBlock,
 
 - 
inline common::QuantMode constexpr getQuantMode() const noexcept#
 - inline void constexpr setQuantMode(
- common::QuantMode QuantMode,
 
 - 
inline bool constexpr supportsInflightBatching() const noexcept#
 - 
inline SizeType32 constexpr getMaxBatchSize() const noexcept#
 - inline void constexpr setMaxBatchSize(
- SizeType32 maxBatchSize,
 
 - 
inline SizeType32 constexpr getMaxBeamWidth() const noexcept#
 - inline void constexpr setMaxBeamWidth(
- SizeType32 maxBeamWidth,
 
 - 
inline SizeType32 constexpr getMaxInputLen() const noexcept#
 - 
inline void constexpr setMaxInputLen(SizeType32 maxInputLen) noexcept#
 - 
inline SizeType32 constexpr getMaxSequenceLen() const noexcept#
 - inline void constexpr setMaxSequenceLen(
- SizeType32 maxSequenceLen,
 
 - inline std::optional<SizeType32> constexpr getMaxNumTokens(
 
 - inline void constexpr setMaxNumTokens(
- std::optional<SizeType32> maxNumTokens,
 
 - 
inline SizeType32 constexpr getMaxEncoderLen() const noexcept#
 - inline void constexpr setMaxEncoderLen(
- SizeType32 maxEncoderLen,
 
 - 
inline bool constexpr usePromptTuning() const noexcept#
 - 
inline bool constexpr useMrope() const noexcept#
 - 
inline void constexpr setUseMrope(bool useMrope) noexcept#
 - 
inline SizeType32 constexpr getMaxPositionEmbeddings() const noexcept#
 - inline void constexpr setMaxPositionEmbeddings(
- SizeType32 maxPositionEmbeddings,
 
 - 
inline SizeType32 constexpr getRotaryEmbeddingDim() const noexcept#
 - inline void constexpr setRotaryEmbeddingDim(
- SizeType32 rotaryEmbeddingDim,
 
 - inline SizeType32 constexpr getMaxPromptEmbeddingTableSize(
 
 - inline void constexpr setMaxPromptEmbeddingTableSize(
- SizeType32 maxPromptEmbeddingTableSize,
 
 - 
inline bool constexpr computeContextLogits() const noexcept#
 - inline void constexpr computeContextLogits(
- bool computeContextLogits,
 
 - 
inline bool constexpr computeGenerationLogits() const noexcept#
 - inline void constexpr computeGenerationLogits(
- bool computeGenerationLogits,
 
 - 
inline ModelVariant getModelVariant() const#
 - 
inline void setModelVariant(ModelVariant modelVariant)#
 - 
inline SizeType32 getMaxDecodingDraftTokens() const#
 - 
inline SizeType32 constexpr getMaxDecodingTokens() const noexcept#
 - 
inline void constexpr setContextFMHA(bool contextFMHA) noexcept#
 - 
inline bool constexpr getContextFMHA() const noexcept#
 - inline void constexpr setPagedContextFMHA(
- bool pagedContextFMHA,
 
 - 
inline bool constexpr getPagedContextFMHA() const noexcept#
 - inline void constexpr setPpReduceScatter(
- bool ppReduceScatter,
 
 - 
inline bool constexpr getPpReduceScatter() const noexcept#
 - 
inline bool constexpr useLoraPlugin() const noexcept#
 - 
inline void constexpr useLoraPlugin(bool useLoraPlugin) noexcept#
 - inline std::vector<LoraModule> const &getLoraModules(
 
 - inline void setLoraModules(
- std::vector<LoraModule> const &loraModules,
 
 - 
inline SizeType32 constexpr getMlpHiddenSize() const noexcept#
 - inline void constexpr setMlpHiddenSize(
- SizeType32 mlpHiddenSize,
 
 - 
inline bool constexpr isKVCacheEnabled() const noexcept#
 - 
inline bool constexpr isPagedKVCache() const noexcept#
 - 
inline bool constexpr isContinuousKVCache() const noexcept#
 - 
inline KVCacheType constexpr getKVCacheType() const noexcept#
 - inline void constexpr setKVCacheType(
- KVCacheType kvCacheType,
 
 - 
inline bool constexpr useCrossAttention() const noexcept#
 - inline void constexpr setUseCrossAttention(
- bool useCrossAttention,
 
 - 
inline bool constexpr usePositionEmbedding() const noexcept#
 - inline void constexpr setUsePositionEmbedding(
- bool usePositionEmbedding,
 
 - 
inline bool constexpr useTokenTypeEmbedding() const noexcept#
 - inline void constexpr setUseTokenTypeEmbedding(
- bool useTokenTypeEmbedding,
 
 - 
inline SizeType32 constexpr getMaxLoraRank() const noexcept#
 - 
inline void constexpr setMaxLoraRank(SizeType32 maxLoraRank) noexcept#
 - inline void setSpeculativeDecodingMode(
- SpeculativeDecodingMode mode,
 
 - 
inline bool hasSpeculativeDecodingModule() const noexcept#
 - inline SpeculativeDecodingModule const &getSpeculativeDecodingModule(
 
 - inline std::shared_ptr<SpeculativeDecodingModule const> getSpeculativeDecodingModulePtr(
 
 - inline std::shared_ptr<SpeculativeDecodingModule> getSpeculativeDecodingModulePtr(
 
 - std::shared_ptr<SpeculativeDecodingModule> const &speculativeDecodingModule,
 
 - 
inline void resetSpeculativeDecodingModule() noexcept#
 - inline void enableSeamlessLookaheadDecoding(
- SizeType32 maxDraftTokens,
 
 - 
inline void disableSeamlessLookaheadDecoding() noexcept#
 - 
inline bool constexpr isTransformerBased() const noexcept#
 - 
inline bool hasRnnConfig() const noexcept#
 - 
inline bool constexpr isRnnBased() const noexcept#
 - inline SpeculativeDecodingMode constexpr getSpeculativeDecodingMode(
 
 - 
inline void setUseShapeInference(bool useShapeInference) noexcept#
 - 
inline bool useShapeInference() const noexcept#
 - 
inline ManageWeightsType getManageWeightsType() const noexcept#
 - inline void setManageWeightsType(
- ManageWeightsType const manageWeightType,
 
 - 
inline std::string const &getModelName() const noexcept#
 - 
inline void setModelName(std::string const &modelName)#
 - inline std::vector<SizeType32> const &getNumKvHeadsPerLayer(
 
 - inline std::vector<SizeType32> getNumKvHeadsForGivenLayers(
- std::vector<SizeType32> const &layers,
- bool isCrossAttention,
 
 - inline std::pair<std::vector<SizeType32>::const_iterator, std::vector<SizeType32>::const_iterator> getNumKvHeadsPerLayerLocalRange(
- SizeType32 pipelineParallelism = 1,
- SizeType32 pipelineParallelismRank = 0,
- bool isCrossAttention = false,
 
 - inline void setNumKvHeadsPerLayer(
- std::vector<SizeType32> const &headsPerLayer,
 
 - inline void setNumKvHeadsPerCrossLayer(
- std::vector<SizeType32> const &headsPerLayer,
 
 - 
inline bool constexpr skipCrossAttnBlocks() const noexcept#
 - inline void constexpr setSkipCrossAttnBlocks(
- bool skipCrossAttnBlocks,
 
 - inline std::optional<SizeType32> constexpr getNumLanguages(
 
 - 
inline bool constexpr useLanguageAdapter() const noexcept#
 - inline void constexpr setNumLanguages(
- std::optional<SizeType32> numLanguages,
 
 - 
inline bool isMultiModal() const#
 - 
inline bool isWhisper() const#
 - Public Static Functions - 
static inline KVCacheType KVCacheTypeFromString(std::string value)#
 - static inline std::vector<SizeType32> getOptProfilesSplitPoints(
 
 - Public Static Attributes - 
static constexpr std::array kOPT_PROFILES_SPLIT_POINTS = {64, 128, 256, 512, 1024}#
 - 
static constexpr SizeType32 kDEFAULT_NUM_TOKENS_PER_BLOCK = 64#
 - Private Members - 
SizeType32 mVocabSize#
 - 
SizeType32 mNbLayers#
 - 
SizeType32 mNbAttentionLayers#
 - 
SizeType32 mNbRnnLayers#
 - 
SizeType32 mNbHeads#
 - 
SizeType32 mHiddenSize#
 - 
SizeType32 mSizePerHead#
 - 
bool mUseGptAttentionPlugin#
 - 
bool mUseGemmAllReducePlugin#
 - 
bool mUseMambaConv1dPlugin#
 - 
bool mInputPacked#
 - 
bool mPagedState#
 - 
SizeType32 mTokensPerBlock#
 - 
common::QuantMode mQuantMode#
 - 
SizeType32 mMaxBatchSize#
 - 
SizeType32 mMaxBeamWidth#
 - 
SizeType32 mMaxInputLen#
 - 
SizeType32 mMaxSequenceLen#
 - 
std::optional<SizeType32> mMaxNumTokens#
 - 
bool mComputeContextLogits#
 - 
bool mComputeGenerationLogits#
 - 
ModelVariant mModelVariant#
 - 
SizeType32 mMaxPromptEmbeddingTableSize#
 - 
bool mUseMrope#
 - 
SizeType32 mMaxPositionEmbeddings#
 - 
SizeType32 mRotaryEmbeddingDim#
 - 
bool mContextFMHA#
 - 
bool mPagedContextFMHA#
 - 
bool mPpReduceScatter#
 - 
bool mUseLoraPlugin#
 - 
std::vector<LoraModule> mLoraModules#
 - 
SizeType32 mMlpHiddenSize#
 - 
SizeType32 mMaxLoraRank#
 - 
KVCacheType mKVCacheType = KVCacheType::kCONTINUOUS#
 - 
SizeType32 mMaxEncoderLen = {}#
 - 
SizeType32 mEncoderHiddenSize = {}#
 - 
bool mUseCrossAttention#
 - 
bool mUsePositionEmbedding#
 - 
bool mUseTokenTypeEmbedding#
 - 
std::shared_ptr<SpeculativeDecodingModule> mSpeculativeDecodingModule#
 - 
SpeculativeDecodingMode mSpeculativeDecodingMode#
 - 
bool mUseShapeInference#
 - 
ManageWeightsType mManageWeightsType#
 - 
std::string mModelName#
 - 
std::vector<SizeType32> mNumKvHeadsPerAttentionLayer#
 - 
std::vector<SizeType32> mNumKvHeadsPerCrossAttentionLayer#
 - 
bool mSkipCrossAttnBlocks#
 - 
std::optional<SizeType32> mNumLanguages#
 - 
struct RnnConfig#
- Public Members - 
SizeType32 stateSize = 0#
 - 
SizeType32 convKernel = 0#
 - 
SizeType32 rnnHiddenSize = 0#
 - 
SizeType32 rnnHeadSize = 0#
 - 
SizeType32 rnnConvDimSize = 0#
 
- 
SizeType32 stateSize = 0#
 
- 
enum class ModelVariant : std::int32_t#
 
- 
class ModelConfig#
 
- 
namespace runtime
iGptDecoderBatched.h#
- 
namespace tensorrt_llm
- 
namespace batch_manager
 - 
namespace runtime
- 
class IGptDecoderBatched#
- #include <iGptDecoderBatched.h> GPT decoder class with support for in-flight batching. Subclassed by tensorrt_llm::runtime::GptDecoderBatched. Public Types - 
using CudaStreamPtr = std::shared_ptr<CudaStream>#
 - 
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
 - 
using RequestVector = std::vector<LlmRequestPtr>#
 Public Functions - virtual void setup(
- executor::DecodingMode const &mode,
- SizeType32 maxNumSequences,
- SizeType32 maxBeamWidth,
- nvinfer1::DataType dtype,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
- Setup the decoder before calling forward()
 
 - virtual void disableLookahead(
- RequestVector const &genRequests,
- TensorPtr const &batchSlots,
- Disable Lookahead decoding. 
 
 - virtual CudaEvent forwardAsync(
- decoder::DecoderState const &decoderState,
- decoder_batch::Input const &input,
- Run one step for all requests without blocking the host process and return the token for synchronization. 
 
 - virtual void forward(
- decoder::DecoderState const &decoderState,
- decoder_batch::Input const &input,
- Run one step for all requests and wait for completion on the host. 
 
 - virtual CudaEvent finalize(
- decoder::DecoderState const &decoderState,
- SizeType32 batchSlot,
- SamplingConfig const &samplingConfig,
- bool streaming,
- Gather final beam search results for request batchIdx. Result will only be available after event returned.
 
 
- 
using CudaStreamPtr = std::shared_ptr<CudaStream>#
 - 
namespace decoder#
 - 
namespace decoder_batch#
- 
class Input#
- 
Public Functions - inline explicit Input(
- std::vector<std::vector<TensorConstPtr>> const &logits,
- SizeType32 maxDecoderSteps,
 
 - 
inline explicit Input(std::vector<TensorConstPtr> const &logits)#
 Public Members - 
std::vector<std::vector<TensorConstPtr>> logits#
- [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu - Mandatory parameters Logits 
 - 
SizeType32 maxDecoderSteps#
- Maximum number of decoding tokens of active slots. 
 
 
- 
class Input#
 
- 
class IGptDecoderBatched#
 
- 
namespace batch_manager
cudaStream.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class CudaStream#
- Public Functions - inline explicit CudaStream(
- unsigned int flags = cudaStreamNonBlocking,
- int priority = 0,
- Creates a new cuda stream on the current device. The stream will be destroyed in the destructor. - Parameters:
- flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed. 
- priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed. 
 
 
 
 - inline explicit CudaStream(
- cudaStream_t stream,
- int device,
- bool ownsStream = true,
- Pass an existing cuda stream to this object. - Parameters:
- stream – The stream to pass to this object. 
- device – The device on which the stream was created. 
- ownsStream – Whether this object owns the stream and destroys it in the destructor. 
 
 
 
 - 
inline explicit CudaStream(cudaStream_t stream)#
- Construct with an existing cuda stream or the default stream by passing nullptr. 
 - 
inline int getDevice() const#
- Returns the device on which the stream was created. 
 - 
inline cudaStream_t get() const#
- Returns the stream associated with this object. 
 - 
inline void synchronize() const#
- Synchronizes the stream. 
 
 
- 
class CudaStream#
 
- 
namespace runtime
loraCache.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- Functions - 
std::string to_string(LoraCache::TaskLayerModuleConfig const &v)#
 - std::ostream &operator<<(
- std::ostream &os,
- LoraCache::TaskLayerModuleConfig const &v,
 
 - 
class LoraCache#
- #include <loraCache.h> Caches LoRA weights with LRU eviction policy. Tasks put in the cache are marked in progress and cannot be evicted until they are marked done. A cache page holds an optimally sized LoRA. A page is of size [numSlots x pageWidth]. An optimally sized LoRA is one that has the configured optimalAdapterSize. Conceptually a slot corresponds to an r=1, 1-layer, 1-module set of in/out weights. Page width is set to the number of weights in the smallest module. The number of slots per page is then ceilDiv(num weights in optimally sized LoRA, num weights in smallest module). Cache pages are allocated on one or more blocks. Public Types - 
using TaskIdType = std::uint64_t#
 - 
using TaskLayerModuleConfigListPtr = std::shared_ptr<std::vector<TaskLayerModuleConfig>>#
 Public Functions - LoraCache(
- LoraCachePageManagerConfig const &pageManagerConfig,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
- BufferManager const &bufferManager,
- param[in] pageManagerConfig: a LoraCachePageManagerConfig param[in] modelConfig: a ModelConfig param[in] worldConfig: a WorldConfig param[in] bufferManager: a BufferManager only used to allocate page blocks 
 
 - void put(
- TaskIdType taskId,
- TensorPtr weights,
- TensorPtr config,
- bool load = true,
- put a task in the cache, and claim pages for it, and optionally load task weights. - Parameters:
- taskId – [in] the task id 
- weights – [in] lora weights tensor 
- config – [in] lora config tensor 
- load – [in] if true load weights before returning, otherwise do not 
 
 
 
 - void loadWeights(
- TaskIdType taskId,
- TensorPtr weights,
- TensorPtr config,
- load task weights. This method must be called after put. It is designed to be called asynchronously after put returns with load = false - Parameters:
- taskId – [in] the task id 
- weights – [in] lora weights tensor 
- config – [in] lora config tensor 
 
 
 
 - 
inline bool isLoaded(TaskIdType taskId) const#
- Parameters:
- taskId – [in] the task id 
- Returns:
- — true if task is loaded (weights are in place) and false otherwise 
 
 - 
bool isDone(TaskIdType taskId) const#
- Parameters:
- taskId – [in] the task id 
- Returns:
- — true if task is marked done and can be evicted 
 
 - 
inline bool has(TaskIdType taskId) const#
- Parameters:
- taskId – [in] the task id 
- Returns:
- — true if task is in the cache (not necessarily loaded) and false otherwise 
 
 - 
std::vector<TaskLayerModuleConfig> const &get(TaskIdType taskId)#
- Parameters:
- taskId – [in] the task id 
- Returns:
- — list of Value objects with pointers to task weights 
 
 - 
void bump(TaskIdType taskId)#
- bump task and make it the most recently used - Parameters:
- taskId – [in] the task id 
 
 - 
void markTaskDone(TaskIdType taskId)#
- mark task done meaning it can be evicted - Parameters:
- taskId – [in] the task id 
 
 - 
void markAllDone()#
- mark all tasks in cache done 
 - 
SizeType32 determineNumPages(TaskIdType taskId) const#
- Parameters:
- taskId – [in] the task id 
- Returns:
- — number of pages needed to store the given task 
 
 - 
SizeType32 determineNumPages(TensorPtr config) const#
- Parameters:
- config – [in] lora config tensor 
- Returns:
- — number of pages needed to store the task configured with config tensor 
 
 - 
bool fits(TensorPtr config) const#
- Parameters:
- config – [in] a lora config tensor 
- Returns:
- — true if task fits in cache, false otherwise 
 
 - void copyTask(
- TaskIdType taskId,
- LoraCache &deviceCache,
- bool markDone = false,
- copy task to another cache. Caches must have the same page size. - Parameters:
- taskId – [in] the task id to copy 
- deviceCache – [in] the LoraCache to copy the task to 
- markDone – [in] mark the copied task done as it’s copied 
 
 
 
 - 
SizeType32 getNumPages() const#
- Returns:
- — total number of pages allocated to cache (used or not) 
 
 - 
ITensor::SharedConstPtr getPagePtr(size_t pageId) const#
- Parameters:
- pageId – [in] the page id 
- Returns:
- — const pointer to page 
 
 Public Static Functions - static std::vector<LoraCache::TaskLayerModuleConfig> copyToPages(
- TensorPtr weights,
- TensorPtr config,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
- std::unordered_map<SizeType32, LoraModule> moduleIdToModel,
- BufferManager const &manager,
- std::vector<TensorPtr> const &pages,
- std::vector<std::size_t> const &pageIds,
- Copy task weights to cache pages. - Parameters:
- weights – [in] task weights 
- config – [in] task config tensor 
- modelConfig – [in] a ModelConfig 
- worldConfig – [in] a WorldConfig 
- moduleIdToModel – [in] map from lora module id to LoraModule 
- manager – [in] a BufferManager the manager to use to perform the copies 
- pages – [out] list of page tensors to copy weights to 
- pageIds – [in] page ids for the pages 
 
- Returns:
- — list of cache Values objects 
 
 
 - static void splitTransposeCpu(
- ITensor &output,
- ITensor const &input,
- SizeType32 tpSize,
- SizeType32 tpRank,
- splits second dim of input into tpSize parts and writes the tpRank split to output - Parameters:
- output – [out] output tensor 
- input – [in] input tensor 
- tpSize – [in] number of splits 
- tpRank – [in] the split to write to output 
 
 
 
 Private Types Private Functions - 
void bumpTaskInProgress(TaskIdType taskId)#
 - 
ValueStatus getStatus(TaskIdType taskId) const#
 - 
std::vector<std::size_t> claimPagesWithEvict(SizeType32 numPages)#
- claim numPages, evicting tasks if needed - Parameters:
- numPages – [in] number of pages to claim 
- Throws:
- std::runtime_error – if all pages cannot be claimed 
- Returns:
- — list of page ids 
 
 - std::map<size_t, std::pair<size_t, SizeType32>> copyTaskMapPages(
- TaskValue &targetTaskValue,
- TaskValue const &sourceTaskValue,
- std::vector<size_t> const &targetPageIds,
- LoraCache const &targetCache,
- Internal helper method used inside copyTask. Not thread safe on its own 
 
 Private Members - 
LoraCachePageManagerConfig mPageManagerConfig#
 - 
ModelConfig mModelConfig#
 - 
WorldConfig mWorldConfig#
 - 
mutable std::mutex mPagesMutex#
 - 
std::unique_ptr<LoraCachePageManager> mCachePageManager#
 - 
mutable std::mutex mCacheMutex#
 - 
std::unordered_map<TaskIdType, TaskValuePtr> mCacheMap#
 - 
std::list<TaskIdType> mInProgressTasks#
 - 
std::list<TaskIdType> mDoneTasks#
 - 
std::vector<std::unique_ptr<BufferManager>> mDeviceBufferManagers#
 - 
std::unique_ptr<BufferManager> mBufferManager#
 - 
std::unordered_map<SizeType32, LoraModule> mModuleIdToModule#
 Private Static Functions - 
template<typename T>
 static void splitTransposeCpuInner(
- ITensor &output,
- ITensor const &input,
- SizeType32 tpSize,
- SizeType32 tpRank,
 
 - 
struct TaskLayerModuleConfig#
- #include <loraCache.h> Contains information on a single layer / module. A list of these configs is associated with each task and can be used to populate runtime tensors. Public Functions - 
std::string toString() const#
 - 
bool operator==(LoraCache::TaskLayerModuleConfig const &o) const#
 Public Members - 
std::size_t pageId#
 - 
SizeType32 slotIdx#
 - 
SizeType32 inSize#
 - 
SizeType32 outSize#
 - 
SizeType32 moduleId#
 - 
SizeType32 layerId#
 - 
SizeType32 adapterSize#
 - 
SizeType32 numSlots#
 - 
std::int64_t weightsInPointer#
 - 
std::int64_t weightsOutPointer#
 - 
std::optional<std::int64_t> scalingVecPointer#
 Friends - friend class TaskLayerModuleConfigBindings
 
- 
std::string toString() const#
 - 
struct TaskValue#
- Holds configuration and state for a single task. - Public Functions - 
TaskValue() = delete#
 - 
~TaskValue() = default#
 - inline TaskValue(
- std::vector<std::size_t> const &pageIds,
- TaskLayerModuleConfigListPtr const &configs,
- std::list<TaskIdType>::iterator it,
- bool inProgress,
- bool loaded,
- bool done,
- bool loadInProgress = false,
 
 - Public Members - 
std::vector<std::size_t> pageIds#
 - 
TaskLayerModuleConfigListPtr configs#
 - 
std::list<TaskIdType>::iterator it#
 - 
bool inProgress#
 - 
bool loaded#
 - 
bool done#
- Marks a task as done. This is used to mark a task as done during loading. If done=true at the end of loading (end of put, loadWeights, or copyTask), the task will be marked as done. 
 - 
bool loadInProgress#
- Indicates weights are loading either in put or loadWeights. This is used to block concurrent loadWeights calls for the same task. 
 
- 
TaskValue() = delete#
 
- 
using TaskIdType = std::uint64_t#
 - 
class LoraCacheFullException : public tensorrt_llm::runtime::LoraExpectedException#
 - 
class LoraCachePageManager#
- #include <loraCache.h> Holds memory of lora cache pages, and manages allocation and freeing of whole pages. Memory is pre-allocated either on the host or device. Note that this class is not thread safe. Public Functions - LoraCachePageManager(
- LoraCachePageManagerConfig const &config,
- BufferManager const &bufferManager,
- Parameters:
- config – [in] a LoraCachePageManagerConfig 
- bufferManager – [in] a BufferManager used to allocate page blocks 
 
 
 
 - std::optional<std::vector<std::size_t>> claimPages(
- SizeType32 numPages,
- claim pages - Parameters:
- numPages – [in] number of pages to claim 
- Returns:
- an optional that contains the list of claimed pageIds if the pages were claimed, and is empty otherwise 
 
 
 - 
SizeType32 numAvailablePages() const#
- get number of available (free) pages in manager - Returns:
- number of free pages in manager 
 
 - 
void releasePages(std::vector<std::size_t> const &pages)#
- release given pages - Parameters:
- pages – [in] list of pages to release (free) 
 
 - 
ITensor::SharedConstPtr blockPtr(SizeType32 blockIdx) const#
- return pointer to given page block - Parameters:
- blockIdx – [in] 
- Returns:
- — pointer to page block 
 
 - 
ITensor::SharedConstPtr pagePtr(std::size_t pageIdx) const#
- return pointer to given page - Parameters:
- pageIdx – [in] 
- Returns:
- — const pointer to page 
 
 Private Functions - 
void initialize(BufferManager const &bufferManager)#
 
 - 
class LoraExpectedException : public std::runtime_error#
- Subclassed by tensorrt_llm::runtime::LoraCacheFullException 
 
- 
std::string to_string(LoraCache::TaskLayerModuleConfig const &v)#
 
- 
namespace runtime
medusaModule.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class MedusaModule : public tensorrt_llm::runtime::SpeculativeDecodingModule#
- Public Types - 
using MedusaChoices = std::vector<std::vector<SizeType32>>#
 - Public Functions - inline explicit MedusaModule(
- SizeType32 maxAcceptedTokens,
- SizeType32 maxDraftTokens,
 
 - 
inline explicit MedusaModule() noexcept#
 - 
inline MedusaChoices const &getMedusaChoices() const noexcept#
 - Private Members - 
MedusaChoices mDefaultMedusaChoices = {{0}, {0, 0}, {1}, {0, 1}, {2}, {0, 0, 0}, {1, 0}, {0, 2}, {3}, {0, 3}, {4}, {0, 4}, {2, 0}, {0, 5}, {0, 0, 1}, {5}, {0, 6}, {6}, {0, 7}, {0, 1, 0}, {1, 1}, {7}, {0, 8}, {0, 0, 2}, {3, 0}, {0, 9}, {8}, {9}, {1, 0, 0}, {0, 2, 0}, {1, 2}, {0, 0, 3}, {4, 0}, {2, 1}, {0, 0, 4}, {0, 0, 5}, {0, 0, 0, 0}, {0, 1, 1}, {0, 0, 6}, {0, 3, 0}, {5, 0}, {1, 3}, {0, 0, 7}, {0, 0, 8}, {0, 0, 9}, {6, 0}, {0, 4, 0}, {1, 4}, {7, 0}, {0, 1, 2}, {2, 0, 0}, {3, 1}, {2, 2}, {8, 0}, {0, 5, 0}, {1, 5}, {1, 0, 1}, {0, 2, 1}, {9, 0}, {0, 6, 0}, {0, 0, 0, 1}, {1, 6}, {0, 7, 0}}#
 
- 
using MedusaChoices = std::vector<std::vector<SizeType32>>#
 
- 
class MedusaModule : public tensorrt_llm::runtime::SpeculativeDecodingModule#
 
- 
namespace runtime
decoderState.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
namespace decoder
- 
class BeamSearchBuffers#
- Public Functions - 
explicit BeamSearchBuffers(BufferManager const &bufferManager)#
 - 
void reshape(SizeType32 maxBeamWidth, SizeType32 maxSequenceLength)#
 - Public Members - 
DecodingOutput::BeamHypotheses mOutputBeamHypotheses#
 - 
DecodingOutput::TensorPtr mCumLogProbsTmp#
 - 
SizeType32 mNumSMs#
 
- 
explicit BeamSearchBuffers(BufferManager const &bufferManager)#
 - 
class DecoderState#
- Public Types - 
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
 - 
using RequestVector = std::vector<LlmRequestPtr>#
 - 
using DecodingInputPtr = std::unique_ptr<DecodingInput>#
 - 
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>#
 - Public Functions - 
DecoderState()#
 - void setup(
- SizeType32 maxNumSequences,
- SizeType32 maxBeamWidth,
- SizeType32 maxAttentionWindow,
- SizeType32 sinkTokenLength,
- SizeType32 maxSequenceLength,
- nvinfer1::DataType dtype,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
- BufferManager const &bufferManager,
- Setup buffers for the decoder excluding speculative decoding. 
 
 - void setupCacheIndirection(
- SizeType32 maxNumSequences,
- SizeType32 maxBeamWidth,
- SizeType32 maxAttentionWindow,
- BufferManager const &bufferManager,
- Setup buffers for the cache indirection. - This is used for beam search on pipeline parallel ranks without a decoder. 
 
 - void setupSpeculativeDecoding(
- SpeculativeDecodingMode const &speculativeDecodingMode,
- SizeType32 maxTokensPerEngineStep,
- nvinfer1::DataType dtype,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
- BufferManager const &bufferManager,
- Setup buffers for speculative decoding. 
 
 - 
void disableLookahead(RequestVector const &genRequests)#
- Disable lookahead decoding. 
 - 
TensorPtr getFinishedSum() const#
- Returns:
- [batchSize], number of finished sequences per request, on gpu 
 
 - 
TensorPtr getFinishReasons() const#
- Returns:
- [batchSize, beamWidth], finished states of type FinishedState, on gpu 
 
 - 
TensorPtr getIds() const#
- Returns:
- [batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data. 
 
 - 
TensorPtr getIds(SizeType32 batchIdx) const#
- Parameters:
- batchIdx – index of the batch 
- Returns:
- [maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.
 
 - 
TensorPtr getGatheredIds() const#
- Returns:
- [batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu. 
 
 - 
TensorPtr getGatheredIds(SizeType32 batchIdx) const#
- Parameters:
- batchIdx – index of the batch 
- Returns:
- [maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.
 
 - 
TensorPtr getParentIds() const#
- Returns:
- [batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu 
 
 - 
TensorPtr getCumLogProbs() const#
- Returns:
- [batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu 
 
 - 
TensorPtr getCumLogProbs(SizeType32 batchIdx) const#
- Returns:
- [maxBeamWidth], cumulative log probabilities (per beam), on gpu 
 
 - 
TensorPtr getLogProbs() const#
- Returns:
- [batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu 
 
 - 
TensorPtr getLogProbs(SizeType32 batchIdx) const#
- Returns:
- [maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu 
 
 - 
TensorPtr getSequenceLengths(SizeType32 batchIdx) const#
- Parameters:
- batchIdx – index of the batch 
- Returns:
- [maxBeamWidth], sequence lengths for request batchIdx, on gpu
 
 - 
TensorPtr getAllNewTokens() const#
- Get maxTokensPerStep tokens generated in the last forward pass. - Returns:
- [maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu 
 
 - 
TensorPtr getNextDraftTokens() const#
- Returns:
- [batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu 
 
 - 
TensorPtr getPrevDraftTokensLengths() const#
- Returns:
- [batchSize], predicted draft tokens lengths for previous step, on gpu 
 
 - 
TensorPtr getNextDraftTokensLengths() const#
- Returns:
- [batchSize], predicted draft tokens lengths for next step, on gpu 
 
 - 
TensorPtr getAcceptedLengthsCumSum() const#
- Returns:
- [batchSize + 1], exclusive sum of accepted draft token lengths, on gpu 
 
 - 
TensorPtr getAcceptedPackedPaths() const#
- Returns:
- [batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu 
 
 - 
SizeType32 getMaxNumSequences() const#
 - 
SizeType32 getMaxBeamWidth() const#
 - 
SizeType32 getMaxSequenceLength() const#
 - 
SizeType32 getMaxDecodingDecoderTokens() const#
 - 
SizeType32 getMaxDecodingEngineTokens() const#
 - 
std::vector<SizeType32> const &getNumDecodingEngineTokens() const#
- Get the number of tokens for all requests in the batch. - Returns:
- The number of tokens for all requests in the batch. 
 
 - 
SizeType32 getNumDecodingEngineTokens(SizeType32 batchIdx) const#
- Get the number of tokens for a specific request in the batch. - Parameters:
- batchIdx – The index of the request in the batch. 
- Returns:
- The number of tokens for the specified request. 
 
 - void setNumDecodingEngineTokens(
- SizeType32 batchIdx,
- SizeType32 numTokens,
- Set the number of tokens for a specific request in the batch. - Parameters:
- batchIdx – The index of the request in the batch. 
- numTokens – The number of tokens for the specified request. 
 
 
 
 - 
SpeculativeDecodingMode getSpeculativeDecodingMode() const#
- Get the speculative decoding mode. 
 - ExplicitDraftTokensBuffers::Inputs const &getExplicitDraftTokensBuffers(
- Get the explicit draft tokens buffers. 
 
 - 
EagleBuffers::Inputs const &getEagleBuffers() const#
- Get the eagle buffers. 
 - 
LookaheadDecodingBuffers const &getLookaheadBuffers() const#
- Get the lookahead buffers. 
 - 
BeamSearchBuffers const &getBeamSearchBuffers() const#
- Workspace for beam search in streaming mode. 
 - 
void setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth)#
- Set the beam width for a specific request in the batch. - Parameters:
- batchIdx – The index of the request in the batch. 
- beamWidth – The beam width for the specified request. 
 
 
 - std::optional<std::vector<SizeType32>> const &getGenerationSteps(
- Get the generation steps for all requests in the batch. - Returns:
- The generation steps for all requests in the batch. 
 
 
 - void setGenerationSteps(
- std::vector<SizeType32> const &generationSteps,
- Set the generation steps for all requests in the batch. - Parameters:
- generationSteps – The generation steps for all requests in the batch. 
 
 
 - 
DecodingInput &getJointDecodingInput() const#
- Stateful inputs for the decoder. Allocated for maxNumSequences slots. 
 - 
DecodingOutput &getJointDecodingOutput() const#
- Stateful outputs for the decoder. Allocated for maxNumSequences slots. 
 - Private Functions - void setupBuffers(
- nvinfer1::DataType dtype,
- BufferManager const &bufferManager,
 
 - void reshapeBuffers(
- SizeType32 maxBatchSize,
- SizeType32 maxBeamWidth,
- SizeType32 maxAttentionWindow,
- SizeType32 sinkTokenLength,
- SizeType32 maxSequenceLength,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
- BufferManager const &bufferManager,
 
 - 
void setupCacheIndirectionBuffers(BufferManager const &bufferManager)#
 - void reshapeCacheIndirectionBuffers(
- SizeType32 maxBatchSize,
- SizeType32 maxBeamWidth,
- SizeType32 maxAttentionWindow,
 
 - void setupSpeculativeDecodingBuffers(
- SpeculativeDecodingMode speculativeDecodingMode,
- nvinfer1::DataType dtype,
- BufferManager const &bufferManager,
 
 - void reshapeSpeculativeDecodingBuffers(
- SpeculativeDecodingMode const &speculativeDecodingMode,
- SizeType32 maxTokensPerEngineStep,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
- BufferManager const &bufferManager,
 
 - Private Members - 
SizeType32 mMaxNumSequences = {}#
 - 
SizeType32 mMaxBeamWidth = {}#
 - 
SizeType32 mMaxSequenceLength = {}#
 - 
DecodingInputPtr mJointDecodingInput#
- Stateful inputs for the decoder. Allocated for maxNumSequences slots. 
 - 
DecodingOutputPtr mJointDecodingOutput#
- Stateful outputs for the decoder. Allocated for maxNumSequences slots. 
 - 
std::unique_ptr<BeamSearchBuffers> mBeamSearchBuffers#
- Workspace for beam search in streaming mode. 
 - 
SizeType32 mMaxDecodingDecoderTokens = {1}#
 - 
SizeType32 mMaxDecodingEngineTokens = {1}#
 - 
std::vector<SizeType32> mNumDecodingEngineTokens#
- [batchSize], the num tokens of each request. 
 - 
SpeculativeDecodingMode mSpeculativeDecodingMode = {SpeculativeDecodingMode::None()}#
 
- 
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
 
- 
class BeamSearchBuffers#
 
- 
namespace decoder
 
- 
namespace runtime
lookaheadBuffers.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class LookaheadDecodingBuffers#
- 
Public Functions - LookaheadDecodingBuffers(
- SizeType32 maxNumSequences,
- SizeType32 maxTokensPerStep,
- BufferManager const &bufferManager,
 
 
 - 
class LookaheadRuntimeBuffers#
- 
Public Functions - LookaheadRuntimeBuffers(
- SizeType32 maxBatchSize,
- SizeType32 maxBeamWidth,
- BufferManager const &manager,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
- executor::DecodingConfig const &decodingConfig,
- TllmRuntime const &runtime,
 
 - void setFromInputs(
- SizeType32 numCtxSequences,
- SizeType32 numGenSequences,
- ITensor const &requestTypes,
- ITensor const &seqSlots,
- LookaheadDecodingBuffers const &decoderLookaheadBuffers,
- TllmRuntime const &runtime,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
 
 - void reshape(
- SizeType32 numCtxSequences,
- SizeType32 numGenSequences,
- SizeType32 tokensPerStep,
 
 - void insertInputTensors(
- TensorMap &inputBuffers,
- TensorMap &outputBuffers,
- WorldConfig const &worldConfig,
 
 - void enableLookaheadDecoding(
- SizeType32 maxBatchSize,
- SizeType32 tokensPerStep,
 
 - 
void disableLookaheadDecoding()#
 Public Members 
 
- 
class LookaheadDecodingBuffers#
 
- 
namespace runtime
eagleModule.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class EagleModule : public tensorrt_llm::runtime::SpeculativeDecodingModule#
- Public Functions - inline explicit EagleModule(
- SizeType32 maxDraftPathLen,
- SizeType32 maxDecodingDraftTokens,
- SizeType32 numTransformersLayer,
- SizeType32 maxNonLeafNodesPerLayer,
 
 - 
inline explicit EagleModule() noexcept#
 - inline executor::EagleChoices const &getDefaultEagleChoices(
 
 - 
inline SizeType32 getNumTransformerLayers() const noexcept#
 - 
inline SizeType32 getMaxNonLeafNodesPerLayer() const noexcept#
 - Private Members - 
SizeType32 mNumTransformersLayer#
 - 
SizeType32 mMaxNonLeafNodesPerLayer#
 - 
executor::EagleChoices mDefaultEagleChoices = {{0}, {0, 0}, {1}, {0, 1}, {2}, {0, 0, 0}, {1, 0}, {0, 2}, {3}, {0, 3}, {4}, {0, 4}, {2, 0}, {0, 5}, {0, 0, 1}, {5}, {0, 6}, {6}, {0, 7}, {0, 1, 0}, {1, 1}, {7}, {0, 8}, {0, 0, 2}, {3, 0}, {0, 9}, {8}, {9}, {1, 0, 0}, {0, 2, 0}, {1, 2}, {0, 0, 3}, {4, 0}, {2, 1}, {0, 0, 4}, {0, 0, 5}, {0, 0, 0, 0}, {0, 1, 1}, {0, 0, 6}, {0, 3, 0}, {5, 0}, {1, 3}, {0, 0, 7}, {0, 0, 8}, {0, 0, 9}, {6, 0}, {0, 4, 0}, {1, 4}, {7, 0}, {0, 1, 2}, {2, 0, 0}, {3, 1}, {2, 2}, {8, 0}, {0, 5, 0}, {1, 5}, {1, 0, 1}, {0, 2, 1}, {9, 0}, {0, 6, 0}, {0, 0, 0, 1}, {1, 6}, {0, 7, 0}}#
 
 
- 
class EagleModule : public tensorrt_llm::runtime::SpeculativeDecodingModule#
 
- 
namespace runtime
runtimeDefaults.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
struct RuntimeDefaults#
- Public Functions - inline RuntimeDefaults(
- std::optional<std::vector<SizeType32>> maxAttentionWindowVec,
- std::optional<SizeType32> sinkTokenLength,
 
 - 
RuntimeDefaults() = default#
 - Public Members - 
std::optional<std::vector<SizeType32>> maxAttentionWindowVec#
 - 
std::optional<SizeType32> sinkTokenLength#
 
 
- 
struct RuntimeDefaults#
 
- 
namespace runtime
decodingOutput.h#
- 
namespace tensorrt_llm
- 
namespace batch_manager
 - 
namespace runtime
- 
class DecodingOutput#
- 
Public Functions - 
DecodingOutput() = default#
 Public Members - 
TensorPtr ids#
- Mandatory parameters Previously generated token ids for all steps before DecodingInput.step, [BS, BM, MSL] 
 - 
TensorPtr gatheredIds#
- The tokens computed during the gatherTree step, [BS, BM, MSL] Necessary for “Streaming + Beam Search” mode since beam search kernels store ungathered tokens in ids.
 - 
TensorPtr newTokensSteps#
- New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM]. 
 - 
TensorPtr finishReasons#
- Optional parameters FinishedState by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM] 
 - 
TensorPtr logProbs#
- Mandatory parameters for Beam Search log-probability of generated tokens, [BS, BM, MSL], float 
 - 
BeamHypotheses beamHypotheses#
 - 
std::optional<SpeculativeDecodingOutputs> speculativeDecodingOutputs#
 - 
std::optional<ExplicitDraftTokensBuffers::Inputs> explicitDraftTokensBuffers#
 - 
std::optional<LookaheadDecodingBuffers> lookaheadOutputs#
 - 
std::optional<EagleBuffers::Inputs> eagleBuffers#
 Public Static Attributes - 
static float constexpr kNegativeInfinity = -1e20f#
 - 
class BeamHypotheses#
- Public Functions - 
void empty(BufferManager const &manager)#
 - void reshape(
- SizeType32 batchSize,
- SizeType32 beamWidth,
- SizeType32 maxSequenceLength,
 
 - 
void release()#
 - 
void init(BufferManager const &manager, TokenIdType endId)#
 - 
BeamHypotheses slice(SizeType32 batchIndex, SizeType32 size) const#
 
- 
void empty(BufferManager const &manager)#
 - 
class SpeculativeDecodingOutputs#
 
- 
DecodingOutput() = default#
 
- 
class DecodingOutput#
 
- 
namespace batch_manager
decodingInput.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class DecodingInput#
- #include <decodingInput.h> Represents the inputs to the decoder. This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such. Public Functions - 
DecodingInput() = default#
 Public Members - 
SizeType32 step = {}#
- Mandatory parameters. The index of the decoding step we are on. Only used in Python runtime.
 - 
SizeType32 maxLength = {}#
- The maximum number of tokens to decode. 
 - 
SizeType32 maxAttentionWindow = {}#
- The maximum length of the attention window to consider while decoding. 
 - 
SizeType32 sinkTokenLength = {}#
- The number of tokens to use as attention sinks, https://arxiv.org/html/2309.17453v3. 
 - 
SizeType32 batchSize = {}#
- The number of samples in the batch. 
 - 
std::vector<SizeType32> beamWidths#
- The beam widths of each request, [batchSize]. 
 - 
SizeType32 maxStopWordsLen = {}#
- The maximum value in the `stopWordsLens` tensor.
 - 
SizeType32 maxBadWordsLen = {}#
- The maximum value in the `badWordsLens` tensor.
 - 
std::vector<TensorConstPtr> logitsVec#
- The output of the model forward computation, a probability distribution over the vocabulary [batchSize][numGenTokens, beamWidth, vocabSizePadded] on gpu 
 - 
TensorConstPtr endIds#
- The end ids, [batchSize * beamWidth] on gpu. 
 - 
TensorConstPtr batchSlots#
- Address map of the linear batch id to the seq slots, [batchSize] on pinned, int32_t.
 - 
TensorConstPtr finishReasons#
- Optional parameters. Finished states at current iteration (skip decoding step of a request if true), [batchSize, beamWidth] on gpu.
 - 
TensorConstPtr sequenceLimitLength#
- The maximum sequence length for each sequence in the batch, [batchSize] on gpu. 
 - 
TensorConstPtr embeddingBias#
 - 
TensorConstPtr lengths#
 - 
TensorConstPtr badWordsPtrs#
 - 
TensorConstPtr badWordsLens#
 - 
TensorConstPtr stopWordsPtrs#
 - 
TensorConstPtr stopWordsLens#
 - 
TensorConstPtr noRepeatNgramSize#
 - 
TensorPtr cacheIndirection#
- Parameters for beam search. KV cache index for beam search, [batchSize, beamWidth, maxSeqLen] on gpu.
 - 
std::optional<std::vector<SizeType32>> generationSteps#
- Steps of each request, for Variable-Beam-Width-Search, [batchSize]. 
 - 
std::optional<MedusaInputs> medusaInputs#
 - 
std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs#
 - 
std::optional<LookaheadInputs> lookaheadInputs#
 - 
std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs#
 - 
std::optional<EagleInputs> eagleInputs#
 - 
struct EagleInputs#
- Public Members - 
TensorConstPtr nextDraftTokens#
 - 
TensorConstPtr nextDraftLens#
 - 
TensorConstPtr nextDraftPaths#
 - 
TensorConstPtr lastDraftTokens#
 - 
TensorConstPtr lastDraftLens#
 - 
TensorConstPtr lastDraftPaths#
 - 
TensorConstPtr acceptedTokens#
 - 
TensorConstPtr acceptedLens#
 - 
TensorConstPtr acceptedPathIds#
 - 
TensorConstPtr chunkedContextNextTokens#
 - 
TensorConstPtr seqSlots#
 
- 
TensorConstPtr nextDraftTokens#
 - 
class ExplicitDraftTokensInputs#
- Public Members - 
TensorConstPtr nextDraftTokens#
 - 
TensorConstPtr nextFlatTokens#
 - 
TensorConstPtr nextDraftIndices#
 - 
TensorConstPtr nextDraftProbs#
 - 
TensorConstPtr lastDraftTokens#
 - 
TensorConstPtr lastDraftIndices#
 - 
TensorConstPtr masks#
 - 
TensorConstPtr packedPositionIds#
 - 
TensorConstPtr bestPathLengths#
 - 
TensorConstPtr bestPathIndices#
 - 
TensorConstPtr nextGenerationLengths#
 - 
TensorConstPtr lastPositionIdsBase#
 - 
TensorConstPtr lastGenerationLengths#
 - 
TensorConstPtr maxGenLengthDevice#
 - 
TensorConstPtr seqSlots#
 
- 
TensorConstPtr nextDraftTokens#
 - 
class ExternalDraftTokensInputs#
 - 
struct LookaheadInputs#
 - 
class MedusaInputs#
- Public Members - 
TensorConstPtr medusaPaths#
- [batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu 
 - 
TensorConstPtr medusaTreeIds#
- [batchSize, maxTokensPerStep], on gpu 
 - 
std::vector<std::vector<TensorPtr>> medusaLogits#
- [batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu 
 - 
TensorConstPtr medusaTargetTokensPerStep#
- [batchSize], on gpu 
 
- 
TensorConstPtr medusaPaths#
 
- 
DecodingInput() = default#
 
- 
class DecodingInput#
 
- 
namespace runtime
worldConfig.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class WorldConfig#
- Public Functions - explicit WorldConfig(
- SizeType32 tensorParallelism = 1,
- SizeType32 pipelineParallelism = 1,
- SizeType32 contextParallelism = 1,
- SizeType32 rank = 0,
- SizeType32 gpusPerNode = kDefaultGpusPerNode,
- std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt,
- bool enableAttentionDP = false,
 
 - 
inline SizeType32 constexpr getSize() const noexcept#
 - 
inline SizeType32 constexpr getTensorParallelism() const noexcept#
 - 
inline bool constexpr isTensorParallel() const noexcept#
 - 
inline SizeType32 constexpr getPipelineParallelism() const noexcept#
 - 
inline bool constexpr isPipelineParallel() const noexcept#
 - 
inline SizeType32 constexpr getContextParallelism() const noexcept#
 - 
inline bool constexpr isContextParallel() const noexcept#
 - 
inline SizeType32 constexpr getRank() const noexcept#
 - 
inline SizeType32 constexpr getGpusPerNode() const noexcept#
 - 
inline SizeType32 getGpusPerGroup() const noexcept#
 - 
inline SizeType32 getDevice() const noexcept#
 - 
inline SizeType32 getDeviceOf(SizeType32 rank) const noexcept#
 - 
inline SizeType32 constexpr getPipelineParallelRank() const noexcept#
 - 
inline SizeType32 constexpr getTensorParallelRank() const noexcept#
 - 
inline SizeType32 constexpr getContextParallelRank() const noexcept#
 - 
inline SizeType32 constexpr getLocalRank() const noexcept#
 - 
inline SizeType32 constexpr getNodeRank() const noexcept#
 - inline SizeType32 constexpr getNodeRankOf(
- SizeType32 rank,
 
 - 
inline bool constexpr isFirstPipelineParallelRank() const noexcept#
 - 
inline bool constexpr isLastPipelineParallelRank() const noexcept#
- Is my rank the last rank in its pipeline? 
 - 
inline bool constexpr isFirstTensorParallelRank() const noexcept#
 - 
inline bool constexpr isFirstContextParallelRank() const noexcept#
 - 
inline SizeType32 constexpr getLastRank() const noexcept#
 - 
inline bool constexpr enableAttentionDP() const noexcept#
 - 
std::vector<SizeType32> getPipelineParallelGroup() const#
 - 
std::vector<SizeType32> getTensorParallelGroup() const#
 - 
std::vector<SizeType32> getContextParallelGroup() const#
 - 
bool validMpiConfig() const#
 - Public Static Functions - static WorldConfig mpi(
- SizeType32 gpusPerNode = kDefaultGpusPerNode,
- std::optional<SizeType32> tensorParallelism = std::nullopt,
- std::optional<SizeType32> pipelineParallelism = std::nullopt,
- std::optional<SizeType32> contextParallelism = std::nullopt,
- std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt,
- bool enableAttentionDP = false,
 
 - Public Static Attributes - 
static SizeType32 constexpr kDefaultGpusPerNode = 1#
 - Private Members - 
SizeType32 mTensorParallelism#
 - 
SizeType32 mPipelineParallelism#
 - 
SizeType32 mContextParallelism#
 - 
SizeType32 mRank#
 - 
SizeType32 mGpusPerNode#
 - 
bool mEnableAttentionDP#
 - 
std::vector<SizeType32> mDeviceIds#
 
 
- 
class WorldConfig#
 
- 
namespace runtime
gptDecoderBatched.h#
- 
namespace tensorrt_llm
- 
namespace batch_manager
 - 
namespace runtime
- 
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched#
- #include <gptDecoderBatched.h> GPT decoder class with support for in-flight batching. Public Types - 
using CudaStreamPtr = std::shared_ptr<CudaStream>#
 - 
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
 - 
using RequestVector = std::vector<LlmRequestPtr>#
 Public Functions - 
explicit GptDecoderBatched(CudaStreamPtr stream)#
 - virtual void setup(
- executor::DecodingMode const &mode,
- SizeType32 maxNumSequences,
- SizeType32 maxBeamWidth,
- nvinfer1::DataType dtype,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
- Set up the decoder before calling `forward()`.
 
 - virtual void disableLookahead(
- RequestVector const &genRequests,
- TensorPtr const &batchSlots,
- Disable Lookahead decoding. 
 
 - virtual CudaEvent forwardAsync(
- decoder::DecoderState const &decoderState,
- decoder_batch::Input const &input,
- Run one step for all requests without blocking the host process and return the token for synchronization. 
 
 - virtual void forward(
- decoder::DecoderState const &decoderState,
- decoder_batch::Input const &input,
- Run one step for all requests and wait for completion on the host. 
 
 - virtual CudaEvent finalize(
- decoder::DecoderState const &decoderState,
- SizeType32 batchSlot,
- SamplingConfig const &samplingConfig,
- bool streaming,
- Gather final beam search results for request `batchSlot`. Result will only be available after event returned.
 
 - 
inline CudaStreamPtr getDecoderStream() const#
 - 
inline IGptDecoder &getUnderlyingDecoder() const#
 - 
inline BufferManager const &getBufferManager() const#
 Private Types - 
using GptDecoderPtr = std::unique_ptr<IGptDecoder>#
 Private Functions - void forwardDispatch(
- decoder::DecoderState const &decoderState,
- decoder_batch::Input const &input,
- Calls decoders for tokens per engine step. 
 
 Private Members - 
CudaStreamPtr mRuntimeStream#
 - 
CudaStreamPtr mDecoderStream#
 - 
BufferManager mBufferManager#
 - 
GptDecoderPtr mDecoder#
 
- 
using CudaStreamPtr = std::shared_ptr<CudaStream>#
 
- 
class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched#
 
- 
namespace batch_manager
explicitDraftTokensBuffers.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class ExplicitDraftTokensBuffers#
- Public Types - 
using SizeType32 = runtime::SizeType32#
 - 
using TensorMap = runtime::StringPtrMap<runtime::ITensor>#
 - Public Functions - ExplicitDraftTokensBuffers(
- SizeType32 maxBatchSize,
- SizeType32 maxBeamWidth,
- runtime::BufferManager const &manager,
- runtime::ModelConfig const &modelConfig,
- runtime::WorldConfig const &worldConfig,
 
 - void reshape(
- SizeType32 numCtxSequences,
- SizeType32 numGenSequences,
- runtime::ModelConfig const &modelConfig,
 
 - void setFromInputs(
- SizeType32 numCtxSequences,
- SizeType32 numGenSequences,
- runtime::ITensor const &requestTypes,
- ITensor const &seqSlots,
- ExplicitDraftTokensBuffers::Inputs const &decoderBuffers,
- ITensor const &contextPositionIds,
- runtime::ModelConfig const &modelConfig,
- runtime::WorldConfig const &worldConfig,
- runtime::BufferManager const &manager,
- runtime::CudaStream const &stream,
 
 - void insertInputTensors(
- TensorMap &inputBuffers,
- TensorMap &outputBuffers,
- runtime::WorldConfig const &worldConfig,
 
 - Public Members - 
tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs engineInputs#
 - 
class tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs engineOutputs#
 - 
std::size_t scanTempStorageBytes = {0}#
 - Private Functions - 
template<typename T>
 void setFromInputs(
- SizeType32 numCtxSequences,
- SizeType32 numGenSequences,
- SizeType32 vocabSizePadded,
- ITensor const &seqSlots,
- ExplicitDraftTokensBuffers::Inputs const &draftBuffers,
- ITensor const &contextPositionIds,
- runtime::ExplicitDraftTokensModule const &explicitDraftTokensModule,
- runtime::CudaStream const &stream,
 
 - 
class EngineInputs : public tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs#
 - 
class EngineOutputs#
- Public Members 
 - 
class Inputs#
- Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs - Public Functions - void create(
- SizeType32 maxNumSequences,
- runtime::BufferManager const &manager,
- runtime::ModelConfig const &modelConfig,
- runtime::WorldConfig const &worldConfig,
 
 - Public Members - 
TensorPtr randomDataValidation#
- [maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen] 
 - 
TensorPtr draftTokens#
- [maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen] 
 - 
TensorPtr draftIndices#
- [maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen] 
 - 
TensorPtr draftProbs#
- [maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize] 
 
 
- 
using SizeType32 = runtime::SizeType32#
 
- 
class ExplicitDraftTokensBuffers#
 
- 
namespace runtime
bufferManager.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class BufferManager#
- #include <bufferManager.h> A helper class for managing memory on host and device. Public Types - 
using CudaStreamPtr = std::shared_ptr<CudaStream>#
 - 
using CudaMemPoolPtr = std::shared_ptr<CudaMemPool>#
 Public Functions - 
explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)#
- Construct a BufferManager. - Parameters:
- stream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.). 
 
 - 
inline ~BufferManager()#
- Destructor. 
 - IBufferPtr gpu(
- std::size_t size,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `IBuffer` of the given size on the GPU, using cudaMallocAsync.
 
 - ITensorPtr gpu(
- nvinfer1::Dims dims,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `ITensor` of the given dimensions on the GPU, using cudaMallocAsync.
 
 - IBufferPtr allocate(
- MemoryType memoryType,
- std::size_t size,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `IBuffer` of the given size and memory type.
 
 - ITensorPtr allocate(
- MemoryType memoryType,
- nvinfer1::Dims dims,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `ITensor` of the given dimensions and memory type.
 
 - inline IBufferPtr emptyBuffer(
- MemoryType memoryType,
- nvinfer1::DataType type = kBYTE_TYPE,
- Create an empty `IBuffer` of the given memory type. It may be resized later.
 
 - inline ITensorPtr emptyTensor(
- MemoryType memoryType,
- nvinfer1::DataType type = kBYTE_TYPE,
- Create an empty `ITensor` of the given memory type. It may be reshaped later.
 
 - 
void copy(void const *src, IBuffer &dst, MemoryType srcType) const#
- Copy `src` to `dst`.
 - 
void copy(IBuffer const &src, void *dst, MemoryType dstType) const#
- Copy `src` to `dst`.
 - 
IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const#
- Copy `src` into a new `IBuffer` with a potentially different memory type.
 - 
ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const#
- Copy `src` into a new `ITensor` with a potentially different memory type.
 - 
template<typename T>
 inline IBufferPtr copyFrom(
- std::vector<T> const &src,
- MemoryType memoryType,
- Copy `src` into a new `IBuffer` with a potentially different memory type.
 
 - 
template<typename T>
 inline ITensorPtr copyFrom(
- T *src,
- nvinfer1::Dims dims,
- MemoryType memoryType,
- Copy `src` into a new `ITensor` with a potentially different memory type.
 
 - 
template<typename T>
 inline ITensorPtr copyFrom(
- std::vector<T> const &src,
- nvinfer1::Dims dims,
- MemoryType memoryType,
- Copy `src` into a new `ITensor` with a potentially different memory type.
 
 - 
CudaStream const &getStream() const#
- Get the underlying cuda stream. 
 - 
std::size_t memoryPoolReserved() const#
- The current size of the memory reserved by the memory pool. 
 - 
std::size_t memoryPoolUsed() const#
- The current size of the memory used by the memory pool. 
 - 
std::size_t memoryPoolFree() const#
- The current size of the memory free in the memory pool. 
 - 
void memoryPoolTrimTo(std::size_t size)#
- Try to trim the memory reserved by the pool to `size` bytes. This synchronizes implicitly with the stream.
 Public Static Functions - static IBufferPtr gpuSync(
- std::size_t size,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `IBuffer` of the given size on the GPU, using cudaMalloc.
 
 - static ITensorPtr gpuSync(
- nvinfer1::Dims dims,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `ITensor` of the given dimensions on the GPU, using cudaMalloc.
 
 - static IBufferPtr cpu(
- std::size_t size,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `IBuffer` of the given size on the CPU.
 
 - static ITensorPtr cpu(
- nvinfer1::Dims dims,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `ITensor` of the given dimensions on the CPU.
 
 - static IBufferPtr pinned(
- std::size_t size,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates a pinned `IBuffer` of the given size on the CPU.
 
 - static ITensorPtr pinned(
- nvinfer1::Dims dims,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates a pinned `ITensor` of the given dimensions on the CPU.
 
 - static IBufferPtr pinnedPool(
- std::size_t size,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates a pinned `IBuffer` of the given size on the CPU in the default memory pool.
 
 - static ITensorPtr pinnedPool(
- nvinfer1::Dims dims,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates a pinned `ITensor` of the given dimensions on the CPU in the default memory pool.
 
 - static IBufferPtr managed(
- std::size_t size,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `IBuffer` of the given size in UVM.
 
 - static ITensorPtr managed(
- nvinfer1::Dims dims,
- nvinfer1::DataType type = kBYTE_TYPE,
- Allocates an `ITensor` of the given dimensions in UVM.
 
 - static ITensorPtr ipcNvls( )#
- Allocates an `ITensor` of the given dimensions for NVLS.
 
 Friends - friend class ::BufferManagerTest
 
- 
using CudaStreamPtr = std::shared_ptr<CudaStream>#
 
- 
class BufferManager#
 
- 
namespace runtime
loraModule.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- Functions - inline std::ostream &operator<<(
- std::ostream &output,
- LoraModule const &module,
 
 - 
class LoraModule#
- Public Types - 
enum class ModuleType : SizeType32#
- Values: - 
enumerator kINVALID#
 - 
enumerator kATTN_QKV#
 - 
enumerator kATTN_Q#
 - 
enumerator kATTN_K#
 - 
enumerator kATTN_V#
 - 
enumerator kATTN_DENSE#
 - 
enumerator kMLP_H_TO_4H#
 - 
enumerator kMLP_4H_TO_H#
 - 
enumerator kMLP_GATE#
 - 
enumerator kCROSS_ATTN_QKV#
 - 
enumerator kCROSS_ATTN_Q#
 - 
enumerator kCROSS_ATTN_K#
 - 
enumerator kCROSS_ATTN_V#
 - 
enumerator kCROSS_ATTN_DENSE#
 - 
enumerator kMOE_H_TO_4H#
 - 
enumerator kMOE_4H_TO_H#
 - 
enumerator kMOE_GATE#
 - 
enumerator kMOE_ROUTER#
 - 
enumerator kMLP_ROUTER#
 - 
enumerator kMLP_GATE_UP#
 
- 
enumerator kINVALID#
 - Public Functions - inline explicit constexpr LoraModule(
- ModuleType const &t,
- SizeType32 inDim,
- SizeType32 outDim,
- bool inDimFirst,
- bool outDimFirst,
- SizeType32 inTpSplitDim,
- SizeType32 outTpSplitDim,
 
 - 
inline explicit constexpr LoraModule() noexcept#
 - 
explicit constexpr LoraModule(LoraModule const &o) = default#
 - 
constexpr LoraModule &operator=(LoraModule const &o) = default#
 - inline SizeType32 constexpr flattenedInOutSize(
- SizeType32 adapterSize,
- bool isDora,
 
 - inline SizeType32 constexpr inSize(
- SizeType32 adapterSize,
 
 - inline SizeType32 constexpr outSize(
- SizeType32 adapterSize,
 
 - inline SizeType32 constexpr localInSize(
- SizeType32 adapterSize,
- SizeType32 tpSize,
 
 - inline SizeType32 constexpr localOutSize(
- SizeType32 adapterSize,
- SizeType32 tpSize,
 
 - inline SizeType32 constexpr localScalesSize(
- SizeType32 tpSize,
- bool isDora,
 
 - inline SizeType32 constexpr localInDim(
- SizeType32 tpSize,
 
 - inline SizeType32 constexpr localOutDim(
- SizeType32 tpSize,
 
 - inline SizeType32 constexpr localInAdapterSize(
- SizeType32 adapterSize,
- SizeType32 tpSize,
 
 - inline SizeType32 constexpr localOutAdapterSize(
- SizeType32 adapterSize,
- SizeType32 tpSize,
 
 - inline SizeType32 constexpr localInOutSize(
- SizeType32 adapterSize,
- SizeType32 tpSize,
 
 - inline SizeType32 constexpr localTotalSize(
- SizeType32 adapterSize,
- SizeType32 tpSize,
- bool isDora,
 
 - 
inline SizeType32 constexpr value() const noexcept#
 - 
inline std::string_view constexpr name() const noexcept#
 - 
inline SizeType32 constexpr inDim() const noexcept#
 - 
inline SizeType32 constexpr outDim() const noexcept#
 - 
inline bool constexpr inDimFirst() const noexcept#
 - 
inline bool constexpr outDimFirst() const noexcept#
 - 
inline SizeType32 constexpr inTpSplitDim() const noexcept#
 - 
inline SizeType32 constexpr outTpSplitDim() const noexcept#
 - Public Static Functions - static std::vector<LoraModule> createLoraModules(
- std::vector<std::string> const &loraModuleNames,
- SizeType32 hiddenSize,
- SizeType32 mlpHiddenSize,
- SizeType32 numAttentionHeads,
- SizeType32 numKvAttentionHeads,
- SizeType32 attentionHeadSize,
- SizeType32 tpSize,
- SizeType32 numExperts,
 
 - static inline ModuleType constexpr toModuleType(
- std::string_view const &name,
 
 - static inline std::string_view constexpr toModuleName(
- ModuleType t,
 
 - 
static inline std::string_view constexpr toModuleName(SizeType32 id)#
 - Private Members - 
ModuleType mType#
 - 
SizeType32 mInDim#
 - 
SizeType32 mOutDim#
 - 
bool mInDimFirst#
 - 
bool mOutDimFirst#
 - 
SizeType32 mInTpSplitDim#
 - 
SizeType32 mOutTpSplitDim#
 
- 
enum class ModuleType : SizeType32#
 
 
- 
namespace runtime
eagleBuffers.h#
- 
namespace tensorrt_llm
- 
namespace batch_manager
 - 
namespace runtime
- 
class EagleBuffers#
- Public Types - 
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
 - 
using RequestVector = std::vector<LlmRequestPtr>#
 - 
using SizeType32 = runtime::SizeType32#
 - 
using TensorMap = runtime::StringPtrMap<runtime::ITensor>#
 - Public Functions - EagleBuffers(
- SizeType32 maxBatchSize,
- SizeType32 maxBeamWidth,
- runtime::BufferManager const &manager,
- runtime::ModelConfig const &modelConfig,
- runtime::WorldConfig const &worldConfig,
- executor::DecodingConfig const &decodingConfig,
 
 - void reshape(
- SizeType32 numCtxSequences,
- SizeType32 numGenSequences,
- runtime::ModelConfig const &modelConfig,
 
 - void setFromInputs(
- RequestVector const &contextRequests,
- RequestVector const &genRequests,
- runtime::ITensor const &requestTypes,
- ITensor const &seqSlots,
- EagleBuffers::Inputs const &decoderBuffers,
- runtime::BufferManager const &manager,
- runtime::ModelConfig const &modelConfig,
- runtime::WorldConfig const &worldConfig,
 
 - void insertInputTensors(
- TensorMap &inputBuffers,
- TensorMap &outputBuffers,
- runtime::WorldConfig const &worldConfig,
 
 - Public Members - 
class tensorrt_llm::runtime::EagleBuffers::EngineOutputs engineOutputs#
 - Private Functions - 
template<typename T>
 void setFromInputs(
- RequestVector const &contextRequests,
- RequestVector const &genRequests,
- SizeType32 vocabSizePadded,
- ITensor const &seqSlots,
- EagleBuffers::Inputs const &draftBuffers,
- runtime::EagleModule const &eagleModule,
- runtime::BufferManager const &manager,
 
 - Private Members - 
std::size_t scanReduceTempStorageBytes = {0}#
 - 
float mDefaultPosteriorThreshold = {0.09f}#
 - 
bool mDoGreedySampling = {true}#
 - 
class EngineOutputs#
- Public Members 
 - 
class Inputs#
- Public Functions - void create(
- SizeType32 maxNumSequences,
- BufferManager const &manager,
- ModelConfig const &modelConfig,
- WorldConfig const &worldConfig,
 
 - Public Members - 
TensorPtr randomDataValidation#
- [maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens] 
 - 
TensorPtr draftTokens#
- [maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens] 
 - 
TensorPtr draftPaths#
- [maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen] 
 - 
TensorPtr draftPathsHost#
- [maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen] 
 - 
TensorPtr specDecodingPackedMasks#
- [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)] 
 - 
TensorPtr inputGenTokensHost#
- [maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens] 
 - 
TensorPtr prevScores#
- [maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens] 
 - 
TensorPtr currentExpandIndices#
- [maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens] 
 - 
TensorPtr allLayersScores#
- [maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] 
 
 
- 
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
 
- 
class EagleBuffers#
 
- 
namespace batch_manager
speculativeDecodingMode.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class SpeculativeDecodingMode#
- Public Types - 
using UnderlyingType = std::uint8_t#
 - Public Functions - 
inline bool constexpr isNone() const#
 - 
inline bool constexpr isDraftTokensExternal() const#
 - 
inline bool constexpr isMedusa() const#
 - 
inline bool constexpr isLookaheadDecoding() const#
 - 
inline bool constexpr isExplicitDraftTokens() const#
 - 
inline bool constexpr isEagle() const#
 - 
inline bool constexpr updatesPositionIds() const#
 - 
inline bool constexpr requiresAttentionMask() const#
 - 
inline bool constexpr predictsDraftTokens() const#
 - 
inline bool constexpr needsKVCacheRewind() const#
 - 
inline bool constexpr variableDraftLength() const#
 - 
inline bool constexpr hasDraftLogits() const#
 - 
inline bool constexpr needsDecoderPrologue() const#
 - 
inline bool operator==(SpeculativeDecodingMode const &other) const#
 - inline explicit constexpr SpeculativeDecodingMode(
- UnderlyingType state,
 
 - Public Static Functions - 
static inline auto constexpr None()#
 - 
static inline auto constexpr DraftTokensExternal()#
 - 
static inline auto constexpr Medusa()#
 - 
static inline auto constexpr LookaheadDecoding()#
 - 
static inline auto constexpr ExplicitDraftTokens()#
 - 
static inline auto constexpr Eagle()#
 - Private Functions - 
inline bool constexpr anyBitSet(UnderlyingType bits) const#
 - 
inline bool constexpr allBitSet(UnderlyingType bits) const#
 - Private Members - 
UnderlyingType mState = {kNone}#
 - Private Static Attributes - 
static UnderlyingType constexpr kNone = {1U << 0U}#
 - 
static UnderlyingType constexpr kDraftTokensExternal = {1U << 1U}#
 - 
static UnderlyingType constexpr kMedusa = {1U << 2U}#
 - 
static UnderlyingType constexpr kLookaheadDecoding = {1U << 3U}#
 - 
static UnderlyingType constexpr kExplicitDraftTokens = {1U << 4U}#
 - 
static UnderlyingType constexpr kEagle = {1U << 5U}#
 
- 
using UnderlyingType = std::uint8_t#
 
- 
class SpeculativeDecodingMode#
 
- 
namespace runtime
promptTuningParams.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
template<typename TTensor>
 class GenericPromptTuningParams#
- 
Public Functions 
 - 
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>#
- Public Types - 
using SizeType32 = GenericPromptTuningParams::SizeType32#
 - Public Functions - inline explicit PromptTuningParams( )#
 
 - void fillTasksTensor(
- TensorPtr tasksHost,
- SizeType32 batchSize,
- SizeType32 numContextRequests,
- std::vector<SizeType32> const &reqBeamWidths,
- std::vector<SizeType32> const &reqPromptLengths,
- BufferManager const &manager,
- bool packedInput,
 
 
- 
using SizeType32 = GenericPromptTuningParams::SizeType32#
 
- 
template<typename TTensor>
 
- 
namespace runtime
gptDecoder.h#
- 
namespace tensorrt_llm
- 
namespace layers#
 - 
namespace runtime
- Functions - inline runtime::ITensor::SharedConstPtr getDefaultBatchSlots(
- runtime::SizeType32 batchSize,
- Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder. 
 
 - 
template<typename T>
 class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder#
- Public Types - 
using CudaStreamPtr = BufferManager::CudaStreamPtr#
 - Public Functions - executor::DecodingMode const &mode,
- size_t maxNumSequences,
- size_t maxBeamWidth,
- size_t vocabSize,
- size_t vocabSizePadded,
- CudaStreamPtr const &stream,
- std::shared_ptr<SpeculativeDecodingModule const> speculativeDecodingModule = nullptr,
 
 - virtual void setup(
- SamplingConfig const &samplingConfig,
- size_t batchSize,
- TensorConstPtr const &batchSlots,
- std::optional<DecodingOutput> const &output = std::nullopt,
- std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
- std::optional<std::vector<TensorConstPtr>> const &lookaheadPrompt = std::nullopt,
- std::optional<std::vector<executor::LookaheadDecodingConfig>> const &lookaheadAlgoConfigs = std::nullopt,
- Parameters:
- explicitDraftTokensDType – is only used by the ExplicitDraftTokens model to work around the lack of a bf16 decoder. 
 
 
 - virtual void forwardAsync(
- DecodingOutput &output,
- DecodingInput const &input,
 
 - virtual void forwardSync(
- DecodingOutput &output,
- DecodingInput const &input,
 
 - 
inline virtual SamplingConfig const &getSamplingConfig() override#
 - virtual void disableLookahead(
- std::optional<SamplingConfig> const &samplingConfig,
- SizeType32 batchSize,
- TensorConstPtr batchSlots,
 
 - Private Members - 
std::shared_ptr<BufferManager> mManager#
 - 
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer#
 - 
std::shared_ptr<tensorrt_llm::runtime::DecodingLayerWorkspace> mDecodingLayerWorkspace#
 - 
SamplingConfig mSamplingConfig#
 - 
size_t mMaxNumSequences#
 - 
size_t mVocabSize#
 - 
size_t mVocabSizePadded#
 - 
executor::DecodingMode mDecodingMode#
 
- 
using CudaStreamPtr = BufferManager::CudaStreamPtr#
 - 
class IGptDecoder#
- Subclassed by tensorrt_llm::runtime::GptDecoder< T > - Public Types - 
using TensorConstPtr = runtime::ITensor::SharedConstPtr#
 - Public Functions - 
virtual ~IGptDecoder() = default#
 - virtual void setup(
- SamplingConfig const &samplingConfig,
- size_t batchSize,
- TensorConstPtr const &batchSlots,
- std::optional<DecodingOutput> const &output = std::nullopt,
- std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
- std::optional<std::vector<TensorConstPtr>> const &lookaheadPrompt = std::nullopt,
- std::optional<std::vector<executor::LookaheadDecodingConfig>> const &lookaheadAlgoConfigs = std::nullopt,
- Parameters:
- explicitDraftTokensDType – is only used by the ExplicitDraftTokens model to work around the lack of a bf16 decoder. 
 
 
 - virtual void forwardAsync(
- DecodingOutput &output,
- DecodingInput const &input,
 
 - virtual void forwardSync(
- DecodingOutput &output,
- DecodingInput const &input,
 
 - 
virtual SamplingConfig const &getSamplingConfig() = 0#
 - virtual void disableLookahead(
- std::optional<SamplingConfig> const &samplingConfig,
- SizeType32 batchSize,
- TensorConstPtr batchSlots,
 
 - Public Static Functions - executor::DecodingMode const &mode,
- nvinfer1::DataType dtype,
- size_t maxNumSequences,
- size_t maxBeamWidth,
- size_t vocabSize,
- size_t vocabSizePadded,
- BufferManager::CudaStreamPtr const &stream,
- std::shared_ptr<SpeculativeDecodingModule const> const &speculativeDecodingModule = nullptr,
 
 
- 
using TensorConstPtr = runtime::ITensor::SharedConstPtr#
 
 
- 
namespace layers#
memoryCounters.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class MemoryCounters#
- 
Public Functions - 
MemoryCounters() = default#
 - 
inline SizeType32 getGpu() const#
 - 
inline SizeType32 getCpu() const#
 - 
inline SizeType32 getPinned() const#
 - 
inline SizeType32 getUVM() const#
 - 
inline SizeType32 getPinnedPool() const#
 - 
template<MemoryType T>
 inline void allocate(SizeType32 size)#
 - 
void allocate(MemoryType memoryType, SizeType32 size)#
 - 
template<MemoryType T>
 inline void deallocate(SizeType32 size)#
 - 
void deallocate(MemoryType memoryType, SizeType32 size)#
 - 
std::string toString() const#
 Public Static Functions - 
static MemoryCounters &getInstance()#
 - 
static std::string bytesToString(SizeType32 bytes, int precision = 2)#
 Private Members - 
std::atomic<SizeType32> mGpu = {}#
 - 
std::atomic<SizeType32> mCpu = {}#
 - 
std::atomic<SizeType32> mPinned = {}#
 - 
std::atomic<SizeType32> mUVM = {}#
 - 
std::atomic<SizeType32> mPinnedPool = {}#
 
- 
MemoryCounters() = default#
 
- 
class MemoryCounters#
 
- 
namespace runtime
ipcNvlsMemory.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- Functions - 
void MPI_group_barrier(std::set<int> ranks)#
 - 
bool ipcNvlsSupported()#
 - 
IpcNvlsHandle *ipcNvlsAllocate(size_t size, std::set<int> ranks)#
 - 
void ipcNvlsFree(IpcNvlsHandle *handle)#
 - 
template<typename T>
 class DeviceAllocationNvls#
 - 
struct IpcNvlsHandle#
- Public Members - 
size_t size = 0#
 - 
uintptr_t uc_ptr = 0#
 - 
uintptr_t mc_ptr = 0#
 - 
std::vector<uintptr_t> ipc_uc_ptrs#
 - 
CUdeviceptr uc_va#
 - 
CUdeviceptr mc_va#
 - 
std::vector<CUdeviceptr> ipc_uc_vas#
 - 
CUmemGenericAllocationHandle uc_handle#
 - 
CUmemGenericAllocationHandle mc_handle#
 - 
std::vector<CUmemGenericAllocationHandle> ipc_uc_handles#
 
- 
size_t size = 0#
 
- 
void MPI_group_barrier(std::set<int> ranks)#
 
- 
namespace runtime
rawEngine.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class RawEngine#
- Public Types - Public Functions - 
inline explicit RawEngine(std::filesystem::path enginePath) noexcept#
 - inline explicit RawEngine(
- void const *engineAddr,
- std::size_t engineSize,
 
 - 
inline std::filesystem::path getPath() const#
 - 
inline std::optional<std::filesystem::path> getPathOpt() const#
 - 
inline void setPath(std::filesystem::path enginePath)#
 - inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt(
 
 - inline void setManagedWeightsMap(
- std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap,
 
 - 
inline void const *getAddress() const#
 - 
inline std::size_t getSize() const#
 
- 
inline explicit RawEngine(std::filesystem::path enginePath) noexcept#
 
- 
class RawEngine#
 
- 
namespace runtime
ipcUtils.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- Functions - void lamportInitializeAll(
- void *buffer_0,
- void *buffer_1,
- void *buffer_2,
- size_t size,
 
 - 
bool canAccessPeer(WorldConfig const &worldConfig)#
 - 
class AllReduceBuffers#
- 
Public Functions - AllReduceBuffers(
- SizeType32 maxBatchSize,
- SizeType32 maxBeamWidth,
- SizeType32 maxSequenceLength,
- SizeType32 hiddenSize,
- BufferManager const &manager,
- WorldConfig const &worldConfig,
- bool const fakeBuffers = false,
 
 
 - 
class IpcMemory#
- 
Public Functions - IpcMemory(
- std::size_t bufferSize,
- BufferManager const &manager,
- WorldConfig const &worldConfig,
- bool openIpc = true,
 
 - 
~IpcMemory()#
 - 
inline std::vector<void*> const &getCommPtrs() const#
 Public Static Attributes - 
static size_t constexpr FLAGS_SIZE = (tensorrt_llm::kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)#
 Private Functions - void allocateIpcMemory(
- std::size_t bufferSize,
- BufferManager const &manager,
- WorldConfig const &worldConfig,
 
 - 
void destroyIpcMemory()#
 
 
 
- 
namespace runtime
iBuffer.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- Typedefs - Enums - Functions - 
template<typename T>
 T const *bufferCast(IBuffer const &buffer)#
- Gets a typed pointer to the constant underlying data of the buffer. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- buffer – The buffer to get a pointer to. 
- Returns:
- A pointer to constant T.
 
 - 
template<typename T>
 T *bufferCast(IBuffer &buffer)#
- Gets a typed pointer to the underlying data of the buffer. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- buffer – The buffer to get a pointer to. 
- Returns:
- A pointer to T.
 
 - 
)#
- Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- bufferPtr – A possibly null shared ptr. 
- Returns:
- A pointer to T, possibly nullptr. 
 
 
 - IBuffer::SharedConstPtr const &bufferPtr,
- Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- bufferPtr – A possibly null shared ptr. 
- Returns:
- A pointer to const T, possibly nullptr. 
 
 
 - 
)#
- Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- optionalBufferPtr – A possibly empty optional. 
- Returns:
- A pointer to T, possibly nullptr. 
 
 
 - std::optional<IBuffer::SharedConstPtr> const &optionalBufferPtr,
- Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value. - Template Parameters:
- T – The type of the underlying data. 
- Parameters:
- optionalBufferPtr – A possibly empty optional. 
- Returns:
- A pointer to const T, possibly nullptr. 
 
 
 - 
class BufferDataType#
- #include <iBuffer.h> A wrapper around nvinfer1::DataType that provides support for pointer types. Public Functions - inline constexpr BufferDataType(
- nvinfer1::DataType dataType,
- bool _unsigned = false,
- bool pointer = false,
 
 - 
inline constexpr bool isPointer() const noexcept#
 - 
inline constexpr bool isUnsigned() const#
 - 
inline constexpr std::size_t getSize() const noexcept#
 - 
inline constexpr std::size_t getSizeInBits() const noexcept#
 
 - 
template<typename T>
 class BufferRange : public tensorrt_llm::common::ArrayView<T>#
- Public Types - 
using Base = tensorrt_llm::common::ArrayView<T>#
 
- 
using Base = tensorrt_llm::common::ArrayView<T>#
 - 
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
 struct DataTypeTraits#
- #include <iBuffer.h> For converting a TensorRT data type to a C++ data type. 
 - 
template<nvinfer1::DataType kDataType, bool kUnsigned>
 struct DataTypeTraits<kDataType, kUnsigned, true>#
 - 
template<bool kUnsigned>
 struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>#
- Public Types - 
using type = bool#
 
- 
using type = bool#
 - 
template<>
 struct DataTypeTraits<nvinfer1::DataType::kINT32>#
- Public Types - 
using type = std::int32_t#
 
- 
using type = std::int32_t#
 - 
template<>
 struct DataTypeTraits<nvinfer1::DataType::kINT32, true>#
- Public Types - 
using type = std::uint32_t#
 
- 
using type = std::uint32_t#
 - 
template<>
 struct DataTypeTraits<nvinfer1::DataType::kINT64>#
- Public Types - 
using type = std::int64_t#
 
- 
using type = std::int64_t#
 - 
template<>
 struct DataTypeTraits<nvinfer1::DataType::kINT64, true>#
- Public Types - 
using type = std::uint64_t#
 
- 
using type = std::uint64_t#
 - 
template<bool kUnsigned>
 struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>#
- Public Types - 
using type = std::uint8_t#
 
- 
using type = std::uint8_t#
 - 
class IBuffer#
- Subclassed by tensorrt_llm::runtime::ITensor - Public Types - Public Functions - 
virtual void *data() = 0#
- Returns a pointer to underlying array. 
 - 
virtual void const *data() const = 0#
- Returns a pointer to underlying array. 
 - 
inline virtual void *data(std::size_t index)#
- Returns a pointer to the underlying array at a given element index. 
 - 
inline virtual void const *data(std::size_t index) const#
- Returns a pointer to the underlying array at a given element index. 
 - 
virtual std::size_t getSize() const = 0#
- Returns the size (in number of elements) of the buffer. 
 - 
inline virtual std::size_t getSizeInBytes() const#
- Returns the size (in bytes) of the buffer. 
 - 
virtual std::size_t getCapacity() const = 0#
- Returns the capacity of the buffer. 
 - 
virtual char const *getDataTypeName() const#
 - 
virtual MemoryType getMemoryType() const = 0#
- Returns the memory type of the buffer. 
 - 
virtual char const *getMemoryTypeName() const#
 - 
virtual void resize(std::size_t newSize) = 0#
- Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity. 
 - 
virtual void release() = 0#
- Releases the buffer. It will be reset to nullptr. 
 - 
virtual ~IBuffer() = default#
 - Public Static Functions - SharedPtr buffer,
- std::size_t offset,
- std::size_t size,
- Creates a sliced view on the underlying buffer. The view will have the same data type as buffer. - Parameters:
- buffer – The buffer to view. 
- offset – The offset of the view. 
- size – The size of the view. 
 
- Returns:
- A view on the buffer.
 
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr slice(
- TConstPtr &&tensor,
- std::size_t offset,
- std::size_t size,
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr slice(
- TConstPtr &&tensor,
- std::size_t offset,
 
 - Returns a view on the underlying tensor which can be independently resized. - Parameters:
- tensor – The tensor to view. 
- Returns:
- A view on the tensor.
 
 - Returns a view on the underlying tensor with a different size. - Parameters:
- tensor – The tensor to view. 
- size – The size of the view. 
 
- Returns:
- A view on the tensor.
 
 - 
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
 static inline UniqueConstPtr view(
- TConstPtr &&tensor,
- std::size_t size,
 
 - static UniquePtr wrap(
- void *data,
- DataType type,
- std::size_t size,
- std::size_t capacity,
- Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity. - Parameters:
- data – The data to wrap. 
- type – The data type of the data.
- size – The size of the buffer. 
- capacity – The capacity of the buffer. 
 
- Returns:
- An IBuffer.
 
 
 - 
template<typename T>
 static inline UniquePtr wrap(
- T *data,
- std::size_t size,
- std::size_t capacity,
 
 - 
static MemoryType memoryType(void const *data)#
- Determine the memory type of a pointer. 
 
- 
virtual void *data() = 0#
 - 
template<MemoryType T>
 struct MemoryTypeString#
 - 
template<>
 struct MemoryTypeString<MemoryType::kCPU>#
- Public Static Attributes - 
static auto constexpr value = "CPU"#
 
- 
static auto constexpr value = "CPU"#
 - 
template<>
 struct MemoryTypeString<MemoryType::kGPU>#
- Public Static Attributes - 
static auto constexpr value = "GPU"#
 
- 
static auto constexpr value = "GPU"#
 - 
template<>
 struct MemoryTypeString<MemoryType::kPINNED>#
- Public Static Attributes - 
static auto constexpr value = "PINNED"#
 
- 
static auto constexpr value = "PINNED"#
 - 
template<>
 struct MemoryTypeString<MemoryType::kPINNEDPOOL>#
- Public Static Attributes - 
static auto constexpr value = "PINNEDPOOL"#
 
- 
static auto constexpr value = "PINNEDPOOL"#
 - 
template<>
 struct MemoryTypeString<MemoryType::kUVM>#
- Public Static Attributes - 
static auto constexpr value = "UVM"#
 
- 
static auto constexpr value = "UVM"#
 - 
template<typename T, bool = false>
 struct TRTDataType#
- #include <iBuffer.h> For converting a C++ data type to a TensorRT data type. 
 - 
template<>
 struct TRTDataType<bool>#
 - 
template<>
 struct TRTDataType<float>#
 - 
template<>
 struct TRTDataType<half>#
 - 
template<>
 struct TRTDataType<kernels::FinishedState>#
- Public Static Attributes - 
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value#
 
- 
static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value#
 - 
template<>
 struct TRTDataType<kernels::KVCacheIndex>#
- Public Static Attributes - 
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value#
 
- 
static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value#
 - 
template<>
 struct TRTDataType<runtime::RequestType>#
- Public Static Attributes - 
static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value#
 
- 
static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value#
 - 
template<>
 struct TRTDataType<std::int32_t>#
 - 
template<>
 struct TRTDataType<std::int64_t>#
 - 
template<>
 struct TRTDataType<std::int8_t>#
 - 
template<>
 struct TRTDataType<std::uint32_t>#
- Public Static Attributes - 
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}#
 
- 
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}#
 - 
template<>
 struct TRTDataType<std::uint64_t>#
- Public Static Attributes - 
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}#
 
- 
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}#
 - 
template<>
 struct TRTDataType<std::uint8_t>#
 - 
template<typename T>
 struct TRTDataType<T*>#
- Public Static Attributes - 
static auto constexpr value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}#
 - Private Static Attributes - 
static auto constexpr kUnderlyingType = BufferDataType{TRTDataType<std::remove_const_t<T>, false>::value}#
 
- 
static auto constexpr value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}#
 - 
template<>
 struct TRTDataType<void*>#
- Public Static Attributes - 
static constexpr auto value = BufferDataType::kTrtPointerType#
 
- 
static constexpr auto value = BufferDataType::kTrtPointerType#
 
- 
template<typename T>
 
- 
namespace runtime
gptJsonConfig.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- 
class GptJsonConfig#
- Public Functions - inline GptJsonConfig(
- std::string name,
- std::string version,
- std::string precision,
- SizeType32 tensorParallelism,
- SizeType32 pipelineParallelism,
- SizeType32 contextParallelism,
- SizeType32 gpusPerNode,
- ModelConfig modelConfig,
- std::optional<RuntimeDefaults> runtimeDefaults = std::nullopt,
 
 - 
inline ModelConfig const &getModelConfig() const#
 - 
inline ModelConfig &getModelConfigMutable()#
 - 
inline std::string const &getName() const#
 - 
inline std::string const &getVersion() const#
 - 
inline std::string const &getPrecision() const#
 - 
inline SizeType32 constexpr getTensorParallelism() const#
 - 
inline SizeType32 constexpr getPipelineParallelism() const#
 - 
inline SizeType32 constexpr getContextParallelism() const#
 - 
inline SizeType32 constexpr getGpusPerNode() const#
 - 
inline SizeType32 constexpr getWorldSize() const#
 - 
inline std::optional<RuntimeDefaults> getRuntimeDefaults() const#
 - std::string engineFilename(
- WorldConfig const &worldConfig,
- std::string const &model,
 
 - inline std::string engineFilename(
- WorldConfig const &worldConfig,
 
 - Public Static Functions - 
static GptJsonConfig parse(std::string const &json)#
 - 
static GptJsonConfig parse(std::istream &json)#
 - 
static GptJsonConfig parse(std::filesystem::path const &path)#
 - Private Members - 
std::string const mName#
 - 
std::string const mVersion#
 - 
std::string const mPrecision#
 - 
SizeType32 const mTensorParallelism#
 - 
SizeType32 const mPipelineParallelism#
 - 
SizeType32 const mContextParallelism#
 - 
SizeType32 const mGpusPerNode#
 - 
ModelConfig mModelConfig#
 - 
std::optional<RuntimeDefaults> mRuntimeDefaults#
 
 
- 
class GptJsonConfig#
 
- 
namespace runtime
loraCachePageManagerConfig.h#
- 
namespace tensorrt_llm
- 
namespace runtime
- Functions - inline std::ostream &operator<<(
- std::ostream &os,
- LoraCachePageManagerConfig const &c,
 
 - 
inline std::string to_string(LoraCachePageManagerConfig const &c)#
 - 
class LoraCachePageManagerConfig#
- #include <loraCachePageManagerConfig.h> Configuration for LoraCachePageManager. See LoraCache docs for description of pages, slots, and page blocks. Public Functions - inline explicit constexpr LoraCachePageManagerConfig(
- runtime::MemoryType memType,
- nvinfer1::DataType dType,
- SizeType32 totalNumPages,
- SizeType32 maxPagesPerBlock,
- SizeType32 slotsPerPage,
- SizeType32 pageWidth,
- SizeType32 numCopyStreams,
 
 - 
inline runtime::MemoryType constexpr getMemoryType() const noexcept#
 - inline void constexpr setMemoryType(
- runtime::MemoryType const &memoryType,
 
 - 
inline SizeType32 constexpr getTotalNumPages() const noexcept#
 - inline void constexpr setTotalNumPage(
- SizeType32 const &totalNumPages,
 
 - 
inline SizeType32 constexpr getMaxPagesPerBlock() const noexcept#
 - inline void constexpr setMaxPagesPerBlock(
- SizeType32 const &maxPagesPerBlock,
 
 - 
inline SizeType32 constexpr getSlotsPerPage() const noexcept#
 - inline void constexpr setSlotsPerPage(
- SizeType32 const &slotsPerPage,
 
 - 
inline SizeType32 constexpr getPageWidth() const noexcept#
 - inline void constexpr setPageWidth(
- SizeType32 const &pageWidth,
 
 - 
inline bool constexpr getInitToZero() const noexcept#
 - 
inline void constexpr setInitToZero(bool initToZero) noexcept#
 - 
inline SizeType32 constexpr getNumCopyStreams() const noexcept#
 - inline void constexpr setNumCopyStreams(
- SizeType32 numCopyStreams,
 
 Private Members - 
runtime::MemoryType mMemoryType#
 - 
SizeType32 mTotalNumPages#
 - 
SizeType32 mMaxPagesPerBlock#
 - 
SizeType32 mSlotsPerPage#
 - 
SizeType32 mPageWidth#
 - 
SizeType32 mNumCopyStreams = 1#
 - 
bool mInitToZero#
 
 
 
- 
namespace runtime