tensorrt_llm
Getting Started
Overview
Quick Start Guide
Release Notes
Installation
Installing on Linux
Building from Source Code on Linux
Installing on Windows
Building from Source Code on Windows
Architecture
TensorRT-LLM Architecture
Model Definition
Compilation
Runtime
Multi-GPU and Multi-Node Support
TensorRT-LLM Checkpoint
TensorRT-LLM Build Workflow
Adding a Model
Advanced
Multi-Head, Multi-Query, and Group-Query Attention
C++ GPT Runtime
Graph Rewriting Module
The Batch Manager in TensorRT-LLM
Inference Request
Run gpt-2b + LoRA using GptManager / cpp runtime
Expert Parallelism in TensorRT-LLM
Performance
Overview
Best Practices for Tuning the Performance of TensorRT-LLM
Performance Analysis
Reference
Troubleshooting
Support Matrix
Numerical Precision
Memory Usage of TensorRT-LLM
C++ API
Runtime
Python API
Layers
Functionals
Models
Plugin
Quantization
Runtime
Blogs
H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token
H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM
Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100
Speed up inference with SOTA quantization techniques in TRT-LLM
New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget
tensorrt_llm
Index
Index
A | B | C | D | E | F | G | H | I | K | L | M | N | O | P | Q | R | S | T | U | V | W
A
abs() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
activation() (in module tensorrt_llm.functional)
add() (in module tensorrt_llm.functional)
add_sequence() (tensorrt_llm.runtime.KVCacheManager method)
alibi (tensorrt_llm.functional.PositionEmbeddingType attribute)
alibi_with_scale (tensorrt_llm.functional.PositionEmbeddingType attribute)
allgather() (in module tensorrt_llm.functional)
allreduce() (in module tensorrt_llm.functional)
AllReduceConfig (class in tensorrt_llm.functional)
AllReduceStrategy (class in tensorrt_llm.functional)
apply_rotary_pos_emb() (tensorrt_llm.layers.attention.RopeEmbeddingUtils static method)
apply_rotary_pos_emb_chatglm() (tensorrt_llm.layers.attention.RopeEmbeddingUtils static method)
arange() (in module tensorrt_llm.functional)
argmax() (in module tensorrt_llm.functional)
assertion() (in module tensorrt_llm.functional)
Attention (class in tensorrt_llm.layers.attention)
AttentionMaskType (class in tensorrt_llm.functional)
AttentionParams (class in tensorrt_llm.layers.attention)
AUTO (tensorrt_llm.functional.AllReduceStrategy attribute)
avg_pool2d() (in module tensorrt_llm.functional)
AvgPool2d (class in tensorrt_llm.layers.pooling)
B
bad_words_list (tensorrt_llm.runtime.SamplingConfig attribute)
BaichuanForCausalLM (class in tensorrt_llm.models)
batch_size (tensorrt_llm.runtime.GenerationSession attribute)
beam_search_diversity_rate (tensorrt_llm.runtime.SamplingConfig attribute)
bert_attention() (in module tensorrt_llm.functional)
BertAttention (class in tensorrt_llm.layers.attention)
BertForQuestionAnswering (class in tensorrt_llm.models)
BertForSequenceClassification (class in tensorrt_llm.models)
BertModel (class in tensorrt_llm.models)
bidirectional (tensorrt_llm.functional.AttentionMaskType attribute)
bidirectionalglm (tensorrt_llm.functional.AttentionMaskType attribute)
BloomForCausalLM (class in tensorrt_llm.models)
BloomModel (class in tensorrt_llm.models)
broadcast_helper() (in module tensorrt_llm.functional)
buffer_allocated (tensorrt_llm.runtime.GenerationSession attribute)
C
Cast (class in tensorrt_llm.layers.cast)
cast() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
causal (tensorrt_llm.functional.AttentionMaskType attribute)
chatglm (tensorrt_llm.functional.PositionEmbeddingType attribute)
ChatGLMForCausalLM (class in tensorrt_llm.models)
ChatGLMGenerationSession (class in tensorrt_llm.runtime)
ChatGLMModel (class in tensorrt_llm.models)
check_config() (tensorrt_llm.models.ChatGLMForCausalLM method)
(tensorrt_llm.models.FalconForCausalLM method)
(tensorrt_llm.models.GemmaForCausalLM method)
(tensorrt_llm.models.GPTForCausalLM method)
(tensorrt_llm.models.GPTJForCausalLM method)
(tensorrt_llm.models.LLaMAForCausalLM method)
(tensorrt_llm.models.MPTForCausalLM method)
(tensorrt_llm.models.OPTForCausalLM method)
(tensorrt_llm.models.PhiForCausalLM method)
(tensorrt_llm.models.PretrainedModel method)
(tensorrt_llm.models.QWenForCausalLM method)
choices() (tensorrt_llm.functional.PositionEmbeddingType static method)
chunk() (in module tensorrt_llm.functional)
clip() (in module tensorrt_llm.functional)
ColumnLinear (in module tensorrt_llm.layers.linear)
compute_relative_bias() (in module tensorrt_llm.layers.attention)
concat() (in module tensorrt_llm.functional)
conditional() (in module tensorrt_llm.functional)
constant() (in module tensorrt_llm.functional)
constant_to_tensor_() (in module tensorrt_llm.functional)
context (tensorrt_llm.runtime.Session property)
Conv1d (class in tensorrt_llm.layers.conv)
conv1d() (in module tensorrt_llm.functional)
Conv2d (class in tensorrt_llm.layers.conv)
conv2d() (in module tensorrt_llm.functional)
conv_transpose2d() (in module tensorrt_llm.functional)
convert_hf_checkpoint() (tensorrt_llm.models.PhiForCausalLM class method)
ConvTranspose2d (class in tensorrt_llm.layers.conv)
cos() (in module tensorrt_llm.functional)
create_sinusoidal_positions() (tensorrt_llm.layers.attention.RopeEmbeddingUtils static method)
cross_attention (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
cuda_graph_mode (tensorrt_llm.runtime.GenerationSession attribute)
cuda_stream_guard() (tensorrt_llm.runtime.GenerationSession method)
cumsum() (in module tensorrt_llm.functional)
D
debug_mode (tensorrt_llm.runtime.GenerationSession attribute)
debug_tensors_to_save (tensorrt_llm.runtime.GenerationSession attribute)
decode() (tensorrt_llm.runtime.GenerationSession method)
decode_batch() (tensorrt_llm.runtime.GenerationSession method)
decode_regular() (tensorrt_llm.runtime.GenerationSession method)
decode_stream() (tensorrt_llm.runtime.GenerationSession method)
DecoderModel (class in tensorrt_llm.models)
default_plugin_config() (tensorrt_llm.models.LLaMAForCausalLM method)
device (tensorrt_llm.runtime.GenerationSession attribute)
DimRange (class in tensorrt_llm.functional)
div() (in module tensorrt_llm.functional)
dtype (tensorrt_llm.functional.Tensor property)
(tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
(tensorrt_llm.runtime.TensorInfo attribute)
dump_debug_buffers() (tensorrt_llm.runtime.GenerationSession method)
dynamic (tensorrt_llm.functional.RotaryScalingType attribute)
E
early_stop_criteria() (tensorrt_llm.runtime.GenerationSession method)
early_stopping (tensorrt_llm.runtime.SamplingConfig attribute)
einsum() (in module tensorrt_llm.functional)
elementwise_binary() (in module tensorrt_llm.functional)
Embedding (class in tensorrt_llm.layers.embedding)
embedding() (in module tensorrt_llm.functional)
EncoderModel (class in tensorrt_llm.models)
end_id (tensorrt_llm.runtime.SamplingConfig attribute)
engine (tensorrt_llm.runtime.Session property)
eq() (in module tensorrt_llm.functional)
exp() (in module tensorrt_llm.functional)
expand() (in module tensorrt_llm.functional)
expand_dims() (in module tensorrt_llm.functional)
expand_dims_like() (in module tensorrt_llm.functional)
expand_mask() (in module tensorrt_llm.functional)
F
FalconForCausalLM (class in tensorrt_llm.models)
FalconModel (class in tensorrt_llm.models)
fill_none_tensor_list() (tensorrt_llm.layers.attention.KeyValueCacheParams method)
filter_medusa_logits() (tensorrt_llm.runtime.GenerationSession method)
finalize_decoder() (tensorrt_llm.runtime.GenerationSession method)
find_best_medusa_path() (tensorrt_llm.runtime.GenerationSession method)
first_layer (tensorrt_llm.runtime.GenerationSession property)
flip() (in module tensorrt_llm.functional)
forward() (tensorrt_llm.layers.activation.Mish method)
(tensorrt_llm.layers.attention.Attention method)
(tensorrt_llm.layers.attention.BertAttention method)
(tensorrt_llm.layers.cast.Cast method)
(tensorrt_llm.layers.conv.Conv1d method)
(tensorrt_llm.layers.conv.Conv2d method)
(tensorrt_llm.layers.conv.ConvTranspose2d method)
(tensorrt_llm.layers.embedding.Embedding method)
(tensorrt_llm.layers.embedding.PromptTuningEmbedding method)
(tensorrt_llm.layers.linear.Linear method)
(tensorrt_llm.layers.linear.RowLinear method)
(tensorrt_llm.layers.mlp.FusedGatedMLP method)
(tensorrt_llm.layers.mlp.GatedMLP method)
(tensorrt_llm.layers.mlp.MLP method)
(tensorrt_llm.layers.normalization.GroupNorm method)
(tensorrt_llm.layers.normalization.LayerNorm method)
(tensorrt_llm.layers.normalization.RmsNorm method)
(tensorrt_llm.layers.pooling.AvgPool2d method)
(tensorrt_llm.models.BertForQuestionAnswering method)
(tensorrt_llm.models.BertForSequenceClassification method)
(tensorrt_llm.models.BertModel method)
(tensorrt_llm.models.BloomModel method)
(tensorrt_llm.models.ChatGLMModel method)
(tensorrt_llm.models.DecoderModel method)
(tensorrt_llm.models.EncoderModel method)
(tensorrt_llm.models.FalconModel method)
(tensorrt_llm.models.GPTJModel method)
(tensorrt_llm.models.GPTModel method)
(tensorrt_llm.models.GPTNeoXModel method)
(tensorrt_llm.models.LLaMAModel method)
(tensorrt_llm.models.MambaLMHeadModel method)
(tensorrt_llm.models.MedusaForCausalLm method)
(tensorrt_llm.models.MPTModel method)
(tensorrt_llm.models.OPTModel method)
(tensorrt_llm.models.PhiModel method)
(tensorrt_llm.models.WhisperEncoder method)
frequency_penalty (tensorrt_llm.runtime.SamplingConfig attribute)
from_checkpoint() (tensorrt_llm.models.PretrainedModel class method)
from_config() (tensorrt_llm.models.PretrainedModel class method)
from_dict() (tensorrt_llm.models.PretrainedConfig class method)
from_dir() (tensorrt_llm.runtime.ModelRunner class method)
(tensorrt_llm.runtime.ModelRunnerCpp class method)
from_engine() (tensorrt_llm.runtime.ModelRunner class method)
(tensorrt_llm.runtime.Session static method)
from_hugging_face() (tensorrt_llm.models.GemmaForCausalLM class method)
(tensorrt_llm.models.LLaMAForCausalLM class method)
from_json_file() (tensorrt_llm.models.PretrainedConfig class method)
from_meta_ckpt() (tensorrt_llm.models.LLaMAForCausalLM class method)
from_serialized_engine() (tensorrt_llm.runtime.Session static method)
from_string() (tensorrt_llm.functional.PositionEmbeddingType static method)
FusedGatedMLP (class in tensorrt_llm.layers.mlp)
(tensorrt_llm.functional.MLPType attribute)
G
GatedMLP (class in tensorrt_llm.layers.mlp)
(tensorrt_llm.functional.MLPType attribute)
gather() (in module tensorrt_llm.functional)
gather_context_logits (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
gather_generation_logits (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
gather_last_token_logits() (in module tensorrt_llm.functional)
geglu() (in module tensorrt_llm.functional)
gelu() (in module tensorrt_llm.functional)
GemmaForCausalLM (class in tensorrt_llm.models)
generate() (tensorrt_llm.runtime.ModelRunner method)
(tensorrt_llm.runtime.ModelRunnerCpp method)
(tensorrt_llm.runtime.QWenForCausalLMGenerationSession method)
generate_alibi_biases() (in module tensorrt_llm.functional)
generate_alibi_slopes() (in module tensorrt_llm.functional)
GenerationSequence (class in tensorrt_llm.runtime)
GenerationSession (class in tensorrt_llm.runtime)
get_batch_idx() (tensorrt_llm.runtime.GenerationSequence method)
get_block_pointers() (tensorrt_llm.runtime.KVCacheManager method)
get_first_past_key_value() (tensorrt_llm.layers.attention.KeyValueCacheParams method)
get_next_medusa_tokens() (tensorrt_llm.runtime.GenerationSession method)
get_parent() (tensorrt_llm.functional.Tensor method)
get_seq_idx() (tensorrt_llm.runtime.GenerationSequence method)
get_users() (tensorrt_llm.functional.Tensor method)
gpt_attention() (in module tensorrt_llm.functional)
gpt_attention_plugin (tensorrt_llm.runtime.ModelConfig attribute)
GPTForCausalLM (class in tensorrt_llm.models)
GPTJForCausalLM (class in tensorrt_llm.models)
GPTJModel (class in tensorrt_llm.models)
GPTModel (class in tensorrt_llm.models)
GPTNeoXForCausalLM (class in tensorrt_llm.models)
GPTNeoXModel (class in tensorrt_llm.models)
group_norm() (in module tensorrt_llm.functional)
GroupNorm (class in tensorrt_llm.layers.normalization)
(tensorrt_llm.functional.LayerNormType attribute)
gt() (in module tensorrt_llm.functional)
H
handle_per_step() (tensorrt_llm.runtime.GenerationSession method)
has_position_embedding (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
has_token_type_embedding (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
head_size (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
hidden_size (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
I
identity() (in module tensorrt_llm.functional)
index_select() (in module tensorrt_llm.functional)
infer_shapes() (tensorrt_llm.runtime.Session method)
interpolate() (in module tensorrt_llm.functional)
is_alibi() (tensorrt_llm.functional.PositionEmbeddingType method)
is_dynamic() (tensorrt_llm.functional.Tensor method)
is_gated_activation() (in module tensorrt_llm.functional)
is_medusa_mode (tensorrt_llm.runtime.GenerationSession property)
is_rope() (tensorrt_llm.functional.PositionEmbeddingType method)
is_trt_wrapper() (tensorrt_llm.functional.Tensor method)
is_valid() (tensorrt_llm.layers.attention.AttentionParams method)
(tensorrt_llm.layers.attention.KeyValueCacheParams method)
is_valid_cross_attn() (tensorrt_llm.layers.attention.AttentionParams method)
K
KeyValueCacheParams (class in tensorrt_llm.layers.attention)
KVCacheManager (class in tensorrt_llm.runtime)
L
last_layer (tensorrt_llm.runtime.GenerationSession property)
layer_norm() (in module tensorrt_llm.functional)
LayerNorm (class in tensorrt_llm.layers.normalization)
(tensorrt_llm.functional.LayerNormType attribute)
LayerNormPositionType (class in tensorrt_llm.functional)
LayerNormType (class in tensorrt_llm.functional)
learned_absolute (tensorrt_llm.functional.PositionEmbeddingType attribute)
length_penalty (tensorrt_llm.runtime.SamplingConfig attribute)
Linear (class in tensorrt_llm.layers.linear)
linear (tensorrt_llm.functional.RotaryScalingType attribute)
LLaMAForCausalLM (class in tensorrt_llm.models)
LLaMAModel (class in tensorrt_llm.models)
load() (tensorrt_llm.models.PretrainedModel method)
load_partial_weights() (tensorrt_llm.models.PretrainedModel method)
location (tensorrt_llm.functional.Tensor property)
log() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
LogitsProcessor (class in tensorrt_llm.runtime)
LogitsProcessorList (class in tensorrt_llm.runtime)
lora_plugin (tensorrt_llm.runtime.ModelConfig attribute)
lora_plugin() (in module tensorrt_llm.functional)
lora_target_modules (tensorrt_llm.runtime.ModelConfig attribute)
lt() (in module tensorrt_llm.functional)
M
make_causal_mask() (in module tensorrt_llm.layers.attention)
mamba_conv1d() (in module tensorrt_llm.functional)
mamba_conv1d_plugin (tensorrt_llm.runtime.ModelConfig attribute)
mamba_d_conv (tensorrt_llm.runtime.MambaLMHeadModelGenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
mamba_d_state (tensorrt_llm.runtime.MambaLMHeadModelGenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
mamba_expand (tensorrt_llm.runtime.MambaLMHeadModelGenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
MambaLMHeadModel (class in tensorrt_llm.models)
MambaLMHeadModelGenerationSession (class in tensorrt_llm.runtime)
mapping (tensorrt_llm.runtime.GenerationSession attribute)
mark_output() (tensorrt_llm.functional.Tensor method)
masked_scatter() (in module tensorrt_llm.functional)
masked_select() (in module tensorrt_llm.functional)
matmul() (in module tensorrt_llm.functional)
max() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
max_attention_window_size (tensorrt_llm.runtime.SamplingConfig attribute)
max_batch_size (tensorrt_llm.runtime.ModelConfig attribute)
max_beam_width (tensorrt_llm.runtime.ModelConfig attribute)
max_medusa_tokens (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
max_new_tokens (tensorrt_llm.runtime.SamplingConfig attribute)
max_prompt_embedding_table_size (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
max_sequence_length (tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
maximum() (in module tensorrt_llm.functional)
mean() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
medusa_paths (tensorrt_llm.runtime.GenerationSession attribute)
medusa_position_offsets (tensorrt_llm.runtime.GenerationSession attribute)
medusa_temperature (tensorrt_llm.runtime.GenerationSession attribute)
medusa_topks (tensorrt_llm.runtime.GenerationSession attribute)
medusa_tree_ids (tensorrt_llm.runtime.GenerationSession attribute)
MedusaForCausalLm (class in tensorrt_llm.models)
min_length (tensorrt_llm.runtime.SamplingConfig attribute)
minimum() (in module tensorrt_llm.functional)
Mish (class in tensorrt_llm.layers.activation)
MLP (class in tensorrt_llm.layers.mlp)
(tensorrt_llm.functional.MLPType attribute)
MLPType (class in tensorrt_llm.functional)
model_name (tensorrt_llm.runtime.ModelConfig attribute)
ModelConfig (class in tensorrt_llm.runtime)
ModelRunner (class in tensorrt_llm.runtime)
ModelRunnerCpp (class in tensorrt_llm.runtime)
module
tensorrt_llm, [1], [2], [3], [4], [5]
tensorrt_llm.functional
tensorrt_llm.layers.activation
tensorrt_llm.layers.attention
tensorrt_llm.layers.cast
tensorrt_llm.layers.conv
tensorrt_llm.layers.embedding
tensorrt_llm.layers.linear
tensorrt_llm.layers.mlp
tensorrt_llm.layers.normalization
tensorrt_llm.layers.pooling
tensorrt_llm.models
tensorrt_llm.plugin
tensorrt_llm.quantization
tensorrt_llm.runtime
MPTForCausalLM (class in tensorrt_llm.models)
MPTModel (class in tensorrt_llm.models)
mul() (in module tensorrt_llm.functional)
multiply_gather() (tensorrt_llm.layers.linear.Linear method)
multiply_reduce() (tensorrt_llm.layers.linear.RowLinear method)
N
name (tensorrt_llm.functional.Tensor property)
(tensorrt_llm.runtime.TensorInfo attribute)
NCCL (tensorrt_llm.functional.AllReduceStrategy attribute)
ndim() (tensorrt_llm.functional.Tensor method)
network (tensorrt_llm.functional.Tensor property)
next_medusa_input_ids() (tensorrt_llm.runtime.GenerationSession method)
non_gated_version() (in module tensorrt_llm.functional)
none (tensorrt_llm.functional.RotaryScalingType attribute)
num_beams (tensorrt_llm.runtime.SamplingConfig attribute)
num_heads (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
num_heads_kv (tensorrt_llm.runtime.GenerationSession property)
num_kv_heads (tensorrt_llm.runtime.ModelConfig attribute)
num_layers (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
num_medusa_heads (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
num_medusa_tokens (tensorrt_llm.runtime.GenerationSession attribute)
nvinfer1 (C++ type)
O
ONESHOT (tensorrt_llm.functional.AllReduceStrategy attribute)
op_and() (in module tensorrt_llm.functional)
op_or() (in module tensorrt_llm.functional)
OPTForCausalLM (class in tensorrt_llm.models)
OPTModel (class in tensorrt_llm.models)
outer() (in module tensorrt_llm.functional)
output_cum_log_probs (tensorrt_llm.runtime.SamplingConfig attribute)
output_log_probs (tensorrt_llm.runtime.SamplingConfig attribute)
output_sequence_lengths (tensorrt_llm.runtime.SamplingConfig attribute)
P
pad_id (tensorrt_llm.runtime.SamplingConfig attribute)
padding (tensorrt_llm.functional.AttentionMaskType attribute)
paged_kv_cache (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
paged_state (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
ParallelLMHead (class in tensorrt_llm.layers.linear)
permute() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
PhiForCausalLM (class in tensorrt_llm.models)
PhiModel (class in tensorrt_llm.models)
PluginConfig (class in tensorrt_llm.plugin)
PositionEmbeddingType (class in tensorrt_llm.functional)
post_layernorm (tensorrt_llm.functional.LayerNormPositionType attribute)
pow() (in module tensorrt_llm.functional)
pp_communicate_final_output_ids() (tensorrt_llm.runtime.GenerationSession method)
pp_communicate_new_tokens() (tensorrt_llm.runtime.GenerationSession method)
pre_layernorm (tensorrt_llm.functional.LayerNormPositionType attribute)
prepare_inputs() (tensorrt_llm.models.ChatGLMForCausalLM method)
(tensorrt_llm.models.DecoderModel method)
(tensorrt_llm.models.EncoderModel method)
(tensorrt_llm.models.MambaLMHeadModel method)
(tensorrt_llm.models.MedusaForCausalLm method)
(tensorrt_llm.models.PretrainedModel method)
(tensorrt_llm.models.WhisperEncoder method)
presence_penalty (tensorrt_llm.runtime.SamplingConfig attribute)
PretrainedConfig (class in tensorrt_llm.models)
PretrainedModel (class in tensorrt_llm.models)
process_logits_for_medusa_mode() (tensorrt_llm.runtime.GenerationSession method)
PromptTuningEmbedding (class in tensorrt_llm.layers.embedding)
PUSH_MODE (tensorrt_llm.functional.AllReduceConfig attribute)
Q
QKVColumnLinear (class in tensorrt_llm.layers.linear)
quant_mode (tensorrt_llm.models.PretrainedConfig property)
(tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
QuantAlgo (class in tensorrt_llm.quantization)
quantize() (tensorrt_llm.models.LLaMAForCausalLM class method)
(tensorrt_llm.models.PretrainedModel class method)
quantize_and_export() (in module tensorrt_llm.quantization)
quantize_model() (in module tensorrt_llm.models)
QuantMode (class in tensorrt_llm.quantization)
QWenForCausalLM (class in tensorrt_llm.models)
QWenForCausalLMGenerationSession (class in tensorrt_llm.runtime)
R
random_seed (tensorrt_llm.runtime.SamplingConfig attribute)
rank() (tensorrt_llm.functional.Tensor method)
recv() (in module tensorrt_llm.functional)
relative (tensorrt_llm.functional.PositionEmbeddingType attribute)
release() (tensorrt_llm.models.PretrainedModel method)
relu() (in module tensorrt_llm.functional)
remove_input_padding (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
repeat_interleave() (in module tensorrt_llm.functional)
repetition_penalty (tensorrt_llm.runtime.SamplingConfig attribute)
replace_all_uses_with() (tensorrt_llm.functional.Tensor method)
return_dict (tensorrt_llm.runtime.SamplingConfig attribute)
rms_norm() (in module tensorrt_llm.functional)
RmsNorm (class in tensorrt_llm.layers.normalization)
(tensorrt_llm.functional.LayerNormType attribute)
rope_gpt_neox (tensorrt_llm.functional.PositionEmbeddingType attribute)
rope_gptj (tensorrt_llm.functional.PositionEmbeddingType attribute)
RopeEmbeddingUtils (class in tensorrt_llm.layers.attention)
RotaryScalingType (class in tensorrt_llm.functional)
rotate_every_two() (tensorrt_llm.layers.attention.RopeEmbeddingUtils static method)
rotate_half() (tensorrt_llm.layers.attention.RopeEmbeddingUtils static method)
round() (in module tensorrt_llm.functional)
RowLinear (class in tensorrt_llm.layers.linear)
run() (tensorrt_llm.runtime.Session method)
runtime (tensorrt_llm.runtime.GenerationSession attribute)
(tensorrt_llm.runtime.Session property)
S
SamplingConfig (class in tensorrt_llm.runtime)
save_checkpoint() (tensorrt_llm.models.PretrainedModel method)
select() (in module tensorrt_llm.functional)
selective_scan() (in module tensorrt_llm.functional)
send() (in module tensorrt_llm.functional)
serialize_engine() (tensorrt_llm.runtime.ModelRunner method)
Session (class in tensorrt_llm.runtime)
SET_FROM_OPTIONAL (C macro)
set_if_not_exist() (tensorrt_llm.models.PretrainedConfig method)
set_rank() (tensorrt_llm.models.PretrainedConfig method)
set_shapes() (tensorrt_llm.runtime.Session method)
setup() (tensorrt_llm.runtime.GenerationSession method)
(tensorrt_llm.runtime.MambaLMHeadModelGenerationSession method)
shape (tensorrt_llm.functional.Tensor property)
(tensorrt_llm.runtime.TensorInfo attribute)
shape() (in module tensorrt_llm.functional)
sigmoid() (in module tensorrt_llm.functional)
silu() (in module tensorrt_llm.functional)
sin() (in module tensorrt_llm.functional)
sink_token_length (tensorrt_llm.runtime.SamplingConfig attribute)
size() (tensorrt_llm.functional.Tensor method)
skip_cross_qkv (tensorrt_llm.runtime.ModelConfig attribute)
slice() (in module tensorrt_llm.functional)
softmax() (in module tensorrt_llm.functional)
softplus() (in module tensorrt_llm.functional)
split() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
sqrt() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
squared_relu() (in module tensorrt_llm.functional)
stack() (in module tensorrt_llm.functional)
step() (tensorrt_llm.runtime.KVCacheManager method)
stop_words_list (tensorrt_llm.runtime.SamplingConfig attribute)
StoppingCriteria (class in tensorrt_llm.runtime)
StoppingCriteriaList (class in tensorrt_llm.runtime)
sub() (in module tensorrt_llm.functional)
sum() (in module tensorrt_llm.functional)
swiglu() (in module tensorrt_llm.functional)
T
tanh() (in module tensorrt_llm.functional)
temperature (tensorrt_llm.runtime.SamplingConfig attribute)
Tensor (class in tensorrt_llm.functional)
TensorInfo (class in tensorrt_llm.runtime)
tensorrt_llm
module, [1], [2], [3], [4], [5]
tensorrt_llm (C++ type), [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29]
tensorrt_llm.functional
module
tensorrt_llm.layers.activation
module
tensorrt_llm.layers.attention
module
tensorrt_llm.layers.cast
module
tensorrt_llm.layers.conv
module
tensorrt_llm.layers.embedding
module
tensorrt_llm.layers.linear
module
tensorrt_llm.layers.mlp
module
tensorrt_llm.layers.normalization
module
tensorrt_llm.layers.pooling
module
tensorrt_llm.models
module
tensorrt_llm.plugin
module
tensorrt_llm.quantization
module
tensorrt_llm.runtime
module
tensorrt_llm::batch_manager (C++ type)
tensorrt_llm::batch_manager::kv_cache_manager (C++ type)
tensorrt_llm::executor (C++ type), [1], [2]
tensorrt_llm::executor::BatchingType (C++ enum)
tensorrt_llm::executor::BatchingType::kINFLIGHT (C++ enumerator)
tensorrt_llm::executor::BatchingType::kSTATIC (C++ enumerator)
tensorrt_llm::executor::BeamTokens (C++ type)
tensorrt_llm::executor::CommunicationMode (C++ enum)
tensorrt_llm::executor::CommunicationMode::kLEADER (C++ enumerator)
tensorrt_llm::executor::CommunicationType (C++ enum)
tensorrt_llm::executor::CommunicationType::kMPI (C++ enumerator)
tensorrt_llm::executor::DataType (C++ enum)
tensorrt_llm::executor::DataType::kBF16 (C++ enumerator)
tensorrt_llm::executor::DataType::kBOOL (C++ enumerator)
tensorrt_llm::executor::DataType::kFP16 (C++ enumerator)
tensorrt_llm::executor::DataType::kFP32 (C++ enumerator)
tensorrt_llm::executor::DataType::kFP8 (C++ enumerator)
tensorrt_llm::executor::DataType::kINT32 (C++ enumerator)
tensorrt_llm::executor::DataType::kINT64 (C++ enumerator)
tensorrt_llm::executor::DataType::kINT8 (C++ enumerator)
tensorrt_llm::executor::DataType::kUINT8 (C++ enumerator)
tensorrt_llm::executor::DataType::kUNKNOWN (C++ enumerator)
tensorrt_llm::executor::detail (C++ type)
tensorrt_llm::executor::detail::ofITensor (C++ function)
tensorrt_llm::executor::detail::toITensor (C++ function)
tensorrt_llm::executor::Executor (C++ class)
tensorrt_llm::executor::Executor::awaitResponses (C++ function)
tensorrt_llm::executor::Executor::cancelRequest (C++ function)
tensorrt_llm::executor::Executor::canEnqueueRequests (C++ function)
tensorrt_llm::executor::Executor::enqueueRequest (C++ function)
tensorrt_llm::executor::Executor::enqueueRequests (C++ function)
tensorrt_llm::executor::Executor::Executor (C++ function), [1], [2]
tensorrt_llm::executor::Executor::getLatestIterationStats (C++ function)
tensorrt_llm::executor::Executor::getLatestRequestStats (C++ function)
tensorrt_llm::executor::Executor::getNumResponsesReady (C++ function)
tensorrt_llm::executor::Executor::mImpl (C++ member)
tensorrt_llm::executor::Executor::shutdown (C++ function)
tensorrt_llm::executor::Executor::~Executor (C++ function)
tensorrt_llm::executor::ExecutorConfig (C++ class)
tensorrt_llm::executor::ExecutorConfig::ExecutorConfig (C++ function)
tensorrt_llm::executor::ExecutorConfig::getBatchingType (C++ function)
tensorrt_llm::executor::ExecutorConfig::getEnableChunkedContext (C++ function)
tensorrt_llm::executor::ExecutorConfig::getIterStatsMaxIterations (C++ function)
tensorrt_llm::executor::ExecutorConfig::getKvCacheConfig (C++ function)
tensorrt_llm::executor::ExecutorConfig::getLogitsPostProcessorMap (C++ function)
tensorrt_llm::executor::ExecutorConfig::getMaxBeamWidth (C++ function)
tensorrt_llm::executor::ExecutorConfig::getMedusaChoices (C++ function)
tensorrt_llm::executor::ExecutorConfig::getNormalizeLogProbs (C++ function)
tensorrt_llm::executor::ExecutorConfig::getParallelConfig (C++ function)
tensorrt_llm::executor::ExecutorConfig::getPeftCacheConfig (C++ function)
tensorrt_llm::executor::ExecutorConfig::getRequestStatsMaxIterations (C++ function)
tensorrt_llm::executor::ExecutorConfig::getSchedulerConfig (C++ function)
tensorrt_llm::executor::ExecutorConfig::mBatchingType (C++ member)
tensorrt_llm::executor::ExecutorConfig::mEnableChunkedContext (C++ member)
tensorrt_llm::executor::ExecutorConfig::mIterStatsMaxIterations (C++ member)
tensorrt_llm::executor::ExecutorConfig::mKvCacheConfig (C++ member)
tensorrt_llm::executor::ExecutorConfig::mLogitsPostProcessorMap (C++ member)
tensorrt_llm::executor::ExecutorConfig::mMaxBeamWidth (C++ member)
tensorrt_llm::executor::ExecutorConfig::mMedusaChoices (C++ member)
tensorrt_llm::executor::ExecutorConfig::mNormalizeLogProbs (C++ member)
tensorrt_llm::executor::ExecutorConfig::mParallelConfig (C++ member)
tensorrt_llm::executor::ExecutorConfig::mPeftCacheConfig (C++ member)
tensorrt_llm::executor::ExecutorConfig::mRequestStatsMaxIterations (C++ member)
tensorrt_llm::executor::ExecutorConfig::mSchedulerConfig (C++ member)
tensorrt_llm::executor::ExecutorConfig::setBatchingType (C++ function)
tensorrt_llm::executor::ExecutorConfig::setEnableChunkedContext (C++ function)
tensorrt_llm::executor::ExecutorConfig::setIterStatsMaxIterations (C++ function)
tensorrt_llm::executor::ExecutorConfig::setKvCacheConfig (C++ function)
tensorrt_llm::executor::ExecutorConfig::setLogitsPostProcessorMap (C++ function)
tensorrt_llm::executor::ExecutorConfig::setMaxBeamWidth (C++ function)
tensorrt_llm::executor::ExecutorConfig::setMedusaChoices (C++ function)
tensorrt_llm::executor::ExecutorConfig::setNormalizeLogProbs (C++ function)
tensorrt_llm::executor::ExecutorConfig::setParallelConfig (C++ function)
tensorrt_llm::executor::ExecutorConfig::setPeftCacheConfig (C++ function)
tensorrt_llm::executor::ExecutorConfig::setRequestStatsMaxIterations (C++ function)
tensorrt_llm::executor::ExecutorConfig::setSchedulerConfig (C++ function)
tensorrt_llm::executor::FloatType (C++ type)
tensorrt_llm::executor::IdType (C++ type)
tensorrt_llm::executor::InflightBatchingStats (C++ struct)
tensorrt_llm::executor::InflightBatchingStats::microBatchId (C++ member)
tensorrt_llm::executor::InflightBatchingStats::numContextRequests (C++ member)
tensorrt_llm::executor::InflightBatchingStats::numCtxTokens (C++ member)
tensorrt_llm::executor::InflightBatchingStats::numGenRequests (C++ member)
tensorrt_llm::executor::InflightBatchingStats::numPausedRequests (C++ member)
tensorrt_llm::executor::InflightBatchingStats::numScheduledRequests (C++ member)
tensorrt_llm::executor::IterationStats (C++ struct)
tensorrt_llm::executor::IterationStats::cpuMemUsage (C++ member)
tensorrt_llm::executor::IterationStats::gpuMemUsage (C++ member)
tensorrt_llm::executor::IterationStats::inflightBatchingStats (C++ member)
tensorrt_llm::executor::IterationStats::iter (C++ member)
tensorrt_llm::executor::IterationStats::kvCacheStats (C++ member)
tensorrt_llm::executor::IterationStats::maxNumActiveRequests (C++ member)
tensorrt_llm::executor::IterationStats::numActiveRequests (C++ member)
tensorrt_llm::executor::IterationStats::pinnedMemUsage (C++ member)
tensorrt_llm::executor::IterationStats::staticBatchingStats (C++ member)
tensorrt_llm::executor::IterationStats::timestamp (C++ member)
tensorrt_llm::executor::IterationType (C++ type)
tensorrt_llm::executor::JsonSerialization (C++ class)
tensorrt_llm::executor::JsonSerialization::toJsonStr (C++ function)
,
[1]
,
[2]
tensorrt_llm::executor::kDefaultIterStatsMaxIterations (C++ member)
tensorrt_llm::executor::kDefaultRequestStatsMaxIterations (C++ member)
tensorrt_llm::executor::KvCacheConfig (C++ class)
tensorrt_llm::executor::KvCacheConfig::getEnableBlockReuse (C++ function)
tensorrt_llm::executor::KvCacheConfig::getFreeGpuMemoryFraction (C++ function)
tensorrt_llm::executor::KvCacheConfig::getHostCacheSize (C++ function)
tensorrt_llm::executor::KvCacheConfig::getMaxAttentionWindow (C++ function)
tensorrt_llm::executor::KvCacheConfig::getMaxTokens (C++ function)
tensorrt_llm::executor::KvCacheConfig::getOnboardBlocks (C++ function)
tensorrt_llm::executor::KvCacheConfig::getSinkTokenLength (C++ function)
tensorrt_llm::executor::KvCacheConfig::KvCacheConfig (C++ function)
tensorrt_llm::executor::KvCacheConfig::mEnableBlockReuse (C++ member)
tensorrt_llm::executor::KvCacheConfig::mFreeGpuMemoryFraction (C++ member)
tensorrt_llm::executor::KvCacheConfig::mHostCacheSize (C++ member)
tensorrt_llm::executor::KvCacheConfig::mMaxAttentionWindow (C++ member)
tensorrt_llm::executor::KvCacheConfig::mMaxTokens (C++ member)
tensorrt_llm::executor::KvCacheConfig::mOnboardBlocks (C++ member)
tensorrt_llm::executor::KvCacheConfig::mSinkTokenLength (C++ member)
tensorrt_llm::executor::KvCacheStats (C++ struct)
tensorrt_llm::executor::KvCacheStats::freeNumBlocks (C++ member)
tensorrt_llm::executor::KvCacheStats::maxNumBlocks (C++ member)
tensorrt_llm::executor::KvCacheStats::tokensPerBlock (C++ member)
tensorrt_llm::executor::KvCacheStats::usedNumBlocks (C++ member)
tensorrt_llm::executor::LogitsPostProcessor (C++ type)
tensorrt_llm::executor::LogitsPostProcessorMap (C++ type)
tensorrt_llm::executor::LoraConfig (C++ class)
tensorrt_llm::executor::LoraConfig::getConfig (C++ function)
tensorrt_llm::executor::LoraConfig::getTaskId (C++ function)
tensorrt_llm::executor::LoraConfig::getWeights (C++ function)
tensorrt_llm::executor::LoraConfig::LoraConfig (C++ function)
tensorrt_llm::executor::LoraConfig::mConfig (C++ member)
tensorrt_llm::executor::LoraConfig::mTaskId (C++ member)
tensorrt_llm::executor::LoraConfig::mWeights (C++ member)
tensorrt_llm::executor::MedusaChoices (C++ type)
tensorrt_llm::executor::MemoryType (C++ enum)
tensorrt_llm::executor::MemoryType::kCPU (C++ enumerator)
tensorrt_llm::executor::MemoryType::kCPU_PINNED (C++ enumerator)
tensorrt_llm::executor::MemoryType::kGPU (C++ enumerator)
tensorrt_llm::executor::MemoryType::kUNKNOWN (C++ enumerator)
tensorrt_llm::executor::MemoryType::kUVM (C++ enumerator)
tensorrt_llm::executor::ModelType (C++ enum)
tensorrt_llm::executor::ModelType::kDECODER_ONLY (C++ enumerator)
tensorrt_llm::executor::OutputConfig (C++ class)
tensorrt_llm::executor::OutputConfig::excludeInputFromOutput (C++ member)
tensorrt_llm::executor::OutputConfig::OutputConfig (C++ function)
tensorrt_llm::executor::OutputConfig::returnContextLogits (C++ member)
tensorrt_llm::executor::OutputConfig::returnGenerationLogits (C++ member)
tensorrt_llm::executor::OutputConfig::returnLogProbs (C++ member)
tensorrt_llm::executor::ParallelConfig (C++ class)
tensorrt_llm::executor::ParallelConfig::getCommunicationMode (C++ function)
tensorrt_llm::executor::ParallelConfig::getCommunicationType (C++ function)
tensorrt_llm::executor::ParallelConfig::getDeviceIds (C++ function)
tensorrt_llm::executor::ParallelConfig::getParticipantIds (C++ function)
tensorrt_llm::executor::ParallelConfig::mCommMode (C++ member)
tensorrt_llm::executor::ParallelConfig::mCommType (C++ member)
tensorrt_llm::executor::ParallelConfig::mDeviceIds (C++ member)
tensorrt_llm::executor::ParallelConfig::mParticipantIds (C++ member)
tensorrt_llm::executor::ParallelConfig::ParallelConfig (C++ function)
tensorrt_llm::executor::ParallelConfig::setCommunicationMode (C++ function)
tensorrt_llm::executor::ParallelConfig::setCommunicationType (C++ function)
tensorrt_llm::executor::ParallelConfig::setDeviceIds (C++ function)
tensorrt_llm::executor::ParallelConfig::setParticipantIds (C++ function)
tensorrt_llm::executor::PeftCacheConfig (C++ class)
tensorrt_llm::executor::PeftCacheConfig::getDeviceCachePercent (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getHostCacheSize (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getMaxAdapterSize (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getMaxPagesPerBlockDevice (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getMaxPagesPerBlockHost (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getNumCopyStreams (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getNumDeviceModuleLayer (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getNumEnsureWorkers (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getNumHostModuleLayer (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getNumPutWorkers (C++ function)
tensorrt_llm::executor::PeftCacheConfig::getOptimalAdapterSize (C++ function)
tensorrt_llm::executor::PeftCacheConfig::mDeviceCachePercent (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mHostCacheSize (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mMaxAdapterSize (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mMaxPagesPerBlockDevice (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mMaxPagesPerBlockHost (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mNumCopyStreams (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mNumDeviceModuleLayer (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mNumEnsureWorkers (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mNumHostModuleLayer (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mNumPutWorkers (C++ member)
tensorrt_llm::executor::PeftCacheConfig::mOptimalAdapterSize (C++ member)
tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig (C++ function)
tensorrt_llm::executor::PhonyNameDueToError::value (C++ member)
,
[1]
,
[2]
,
[3]
tensorrt_llm::executor::PromptTuningConfig (C++ class)
tensorrt_llm::executor::PromptTuningConfig::getEmbeddingTable (C++ function)
tensorrt_llm::executor::PromptTuningConfig::mEmbeddingTable (C++ member)
tensorrt_llm::executor::PromptTuningConfig::PromptTuningConfig (C++ function)
tensorrt_llm::executor::RandomSeedType (C++ type)
tensorrt_llm::executor::Request (C++ class)
tensorrt_llm::executor::Request::getBadWords (C++ function)
tensorrt_llm::executor::Request::getEmbeddingBias (C++ function)
tensorrt_llm::executor::Request::getEndId (C++ function)
tensorrt_llm::executor::Request::getInputTokenIds (C++ function)
tensorrt_llm::executor::Request::getLogitsPostProcessorName (C++ function)
tensorrt_llm::executor::Request::getLoraConfig (C++ function)
tensorrt_llm::executor::Request::getMaxNewTokens (C++ function)
tensorrt_llm::executor::Request::getOutputConfig (C++ function)
tensorrt_llm::executor::Request::getPadId (C++ function)
tensorrt_llm::executor::Request::getPromptTuningConfig (C++ function)
tensorrt_llm::executor::Request::getSamplingConfig (C++ function)
tensorrt_llm::executor::Request::getSpeculativeDecodingConfig (C++ function)
tensorrt_llm::executor::Request::getStopWords (C++ function)
tensorrt_llm::executor::Request::getStreaming (C++ function)
tensorrt_llm::executor::Request::mImpl (C++ member)
tensorrt_llm::executor::Request::operator= (C++ function)
,
[1]
tensorrt_llm::executor::Request::Request (C++ function)
,
[1]
,
[2]
tensorrt_llm::executor::Request::setBadWords (C++ function)
tensorrt_llm::executor::Request::setEmbeddingBias (C++ function)
tensorrt_llm::executor::Request::setEndId (C++ function)
tensorrt_llm::executor::Request::setLogitsPostProcessorName (C++ function)
tensorrt_llm::executor::Request::setLoraConfig (C++ function)
tensorrt_llm::executor::Request::setOutputConfig (C++ function)
tensorrt_llm::executor::Request::setPadId (C++ function)
tensorrt_llm::executor::Request::setPromptTuningConfig (C++ function)
tensorrt_llm::executor::Request::setSamplingConfig (C++ function)
tensorrt_llm::executor::Request::setSpeculativeDecodingConfig (C++ function)
tensorrt_llm::executor::Request::setStopWords (C++ function)
tensorrt_llm::executor::Request::setStreaming (C++ function)
tensorrt_llm::executor::Request::~Request (C++ function)
tensorrt_llm::executor::RequestStage (C++ enum)
tensorrt_llm::executor::RequestStage::kCONTEXT_IN_PROGRESS (C++ enumerator)
tensorrt_llm::executor::RequestStage::kGENERATION_COMPLETE (C++ enumerator)
tensorrt_llm::executor::RequestStage::kGENERATION_IN_PROGRESS (C++ enumerator)
tensorrt_llm::executor::RequestStage::kQUEUED (C++ enumerator)
tensorrt_llm::executor::RequestStats (C++ struct)
tensorrt_llm::executor::RequestStats::contextPrefillPosition (C++ member)
tensorrt_llm::executor::RequestStats::id (C++ member)
tensorrt_llm::executor::RequestStats::numGeneratedTokens (C++ member)
tensorrt_llm::executor::RequestStats::paused (C++ member)
tensorrt_llm::executor::RequestStats::scheduled (C++ member)
tensorrt_llm::executor::RequestStats::stage (C++ member)
tensorrt_llm::executor::RequestStatsPerIteration (C++ struct)
tensorrt_llm::executor::RequestStatsPerIteration::iter (C++ member)
tensorrt_llm::executor::RequestStatsPerIteration::requestStats (C++ member)
tensorrt_llm::executor::Response (C++ class)
tensorrt_llm::executor::Response::getErrorMsg (C++ function)
tensorrt_llm::executor::Response::getRequestId (C++ function)
tensorrt_llm::executor::Response::getResult (C++ function)
tensorrt_llm::executor::Response::hasError (C++ function)
tensorrt_llm::executor::Response::mImpl (C++ member)
tensorrt_llm::executor::Response::operator= (C++ function)
,
[1]
tensorrt_llm::executor::Response::Response (C++ function)
,
[1]
,
[2]
,
[3]
tensorrt_llm::executor::Response::~Response (C++ function)
tensorrt_llm::executor::Result (C++ struct)
tensorrt_llm::executor::Result::contextLogits (C++ member)
tensorrt_llm::executor::Result::cumLogProbs (C++ member)
tensorrt_llm::executor::Result::generationLogits (C++ member)
tensorrt_llm::executor::Result::isFinal (C++ member)
tensorrt_llm::executor::Result::logProbs (C++ member)
tensorrt_llm::executor::Result::outputTokenIds (C++ member)
tensorrt_llm::executor::SamplingConfig (C++ class)
tensorrt_llm::executor::SamplingConfig::getBeamSearchDiversityRate (C++ function)
tensorrt_llm::executor::SamplingConfig::getBeamWidth (C++ function)
tensorrt_llm::executor::SamplingConfig::getEarlyStopping (C++ function)
tensorrt_llm::executor::SamplingConfig::getFrequencyPenalty (C++ function)
tensorrt_llm::executor::SamplingConfig::getLengthPenalty (C++ function)
tensorrt_llm::executor::SamplingConfig::getMinLength (C++ function)
tensorrt_llm::executor::SamplingConfig::getPresencePenalty (C++ function)
tensorrt_llm::executor::SamplingConfig::getRandomSeed (C++ function)
tensorrt_llm::executor::SamplingConfig::getRepetitionPenalty (C++ function)
tensorrt_llm::executor::SamplingConfig::getTemperature (C++ function)
tensorrt_llm::executor::SamplingConfig::getTopK (C++ function)
tensorrt_llm::executor::SamplingConfig::getTopP (C++ function)
tensorrt_llm::executor::SamplingConfig::getTopPDecay (C++ function)
tensorrt_llm::executor::SamplingConfig::getTopPMin (C++ function)
tensorrt_llm::executor::SamplingConfig::getTopPResetIds (C++ function)
tensorrt_llm::executor::SamplingConfig::mBeamSearchDiversityRate (C++ member)
tensorrt_llm::executor::SamplingConfig::mBeamWidth (C++ member)
tensorrt_llm::executor::SamplingConfig::mEarlyStopping (C++ member)
tensorrt_llm::executor::SamplingConfig::mFrequencyPenalty (C++ member)
tensorrt_llm::executor::SamplingConfig::mLengthPenalty (C++ member)
tensorrt_llm::executor::SamplingConfig::mMinLength (C++ member)
tensorrt_llm::executor::SamplingConfig::mPresencePenalty (C++ member)
tensorrt_llm::executor::SamplingConfig::mRandomSeed (C++ member)
tensorrt_llm::executor::SamplingConfig::mRepetitionPenalty (C++ member)
tensorrt_llm::executor::SamplingConfig::mTemperature (C++ member)
tensorrt_llm::executor::SamplingConfig::mTopK (C++ member)
tensorrt_llm::executor::SamplingConfig::mTopP (C++ member)
tensorrt_llm::executor::SamplingConfig::mTopPDecay (C++ member)
tensorrt_llm::executor::SamplingConfig::mTopPMin (C++ member)
tensorrt_llm::executor::SamplingConfig::mTopPResetIds (C++ member)
tensorrt_llm::executor::SamplingConfig::operator== (C++ function)
tensorrt_llm::executor::SamplingConfig::SamplingConfig (C++ function)
tensorrt_llm::executor::SchedulerConfig (C++ class)
tensorrt_llm::executor::SchedulerConfig::getPolicy (C++ function)
tensorrt_llm::executor::SchedulerConfig::mPolicy (C++ member)
tensorrt_llm::executor::SchedulerConfig::SchedulerConfig (C++ function)
tensorrt_llm::executor::SchedulerPolicy (C++ enum)
tensorrt_llm::executor::SchedulerPolicy::kGUARANTEED_NO_EVICT (C++ enumerator)
tensorrt_llm::executor::SchedulerPolicy::kMAX_UTILIZATION (C++ enumerator)
tensorrt_llm::executor::Shape (C++ class)
tensorrt_llm::executor::Shape::Base (C++ type)
tensorrt_llm::executor::Shape::DimType (C++ type)
tensorrt_llm::executor::Shape::Shape (C++ function)
,
[1]
,
[2]
tensorrt_llm::executor::SizeType (C++ type)
tensorrt_llm::executor::SpeculativeDecodingConfig (C++ class)
tensorrt_llm::executor::SpeculativeDecodingConfig::getAcceptanceThreshold (C++ function)
tensorrt_llm::executor::SpeculativeDecodingConfig::getLogits (C++ function)
tensorrt_llm::executor::SpeculativeDecodingConfig::getTokens (C++ function)
tensorrt_llm::executor::SpeculativeDecodingConfig::mAcceptanceThreshold (C++ member)
tensorrt_llm::executor::SpeculativeDecodingConfig::mLogits (C++ member)
tensorrt_llm::executor::SpeculativeDecodingConfig::mTokens (C++ member)
tensorrt_llm::executor::SpeculativeDecodingConfig::SpeculativeDecodingConfig (C++ function)
tensorrt_llm::executor::StaticBatchingStats (C++ struct)
tensorrt_llm::executor::StaticBatchingStats::emptyGenSlots (C++ member)
tensorrt_llm::executor::StaticBatchingStats::numContextRequests (C++ member)
tensorrt_llm::executor::StaticBatchingStats::numCtxTokens (C++ member)
tensorrt_llm::executor::StaticBatchingStats::numGenTokens (C++ member)
tensorrt_llm::executor::StaticBatchingStats::numScheduledRequests (C++ member)
tensorrt_llm::executor::StreamPtr (C++ type)
tensorrt_llm::executor::Tensor (C++ class)
tensorrt_llm::executor::Tensor::copyTo (C++ function)
tensorrt_llm::executor::Tensor::copyToCpu (C++ function)
tensorrt_llm::executor::Tensor::copyToGpu (C++ function)
tensorrt_llm::executor::Tensor::copyToManaged (C++ function)
tensorrt_llm::executor::Tensor::copyToPinned (C++ function)
tensorrt_llm::executor::Tensor::copyToPooledPinned (C++ function)
tensorrt_llm::executor::Tensor::cpu (C++ function)
,
[1]
tensorrt_llm::executor::Tensor::CudaStreamPtr (C++ type)
tensorrt_llm::executor::Tensor::detail::ofITensor (C++ function)
tensorrt_llm::executor::Tensor::detail::toITensor (C++ function)
tensorrt_llm::executor::Tensor::getData (C++ function)
,
[1]
tensorrt_llm::executor::Tensor::getDataType (C++ function)
tensorrt_llm::executor::Tensor::getMemoryType (C++ function)
tensorrt_llm::executor::Tensor::getRuntimeType (C++ function)
tensorrt_llm::executor::Tensor::getShape (C++ function)
tensorrt_llm::executor::Tensor::getSize (C++ function)
tensorrt_llm::executor::Tensor::getSizeInBytes (C++ function)
tensorrt_llm::executor::Tensor::gpu (C++ function)
,
[1]
tensorrt_llm::executor::Tensor::Impl (C++ type)
tensorrt_llm::executor::Tensor::managed (C++ function)
,
[1]
tensorrt_llm::executor::Tensor::mTensor (C++ member)
tensorrt_llm::executor::Tensor::of (C++ function)
,
[1]
,
[2]
tensorrt_llm::executor::Tensor::operator bool (C++ function)
tensorrt_llm::executor::Tensor::operator!= (C++ function)
tensorrt_llm::executor::Tensor::operator= (C++ function)
,
[1]
tensorrt_llm::executor::Tensor::operator== (C++ function)
tensorrt_llm::executor::Tensor::pinned (C++ function)
,
[1]
tensorrt_llm::executor::Tensor::pooledPinned (C++ function)
,
[1]
tensorrt_llm::executor::Tensor::setFrom (C++ function)
tensorrt_llm::executor::Tensor::setZero (C++ function)
tensorrt_llm::executor::Tensor::Tensor (C++ function)
,
[1]
,
[2]
,
[3]
tensorrt_llm::executor::Tensor::~Tensor (C++ function)
tensorrt_llm::executor::TensorPtr (C++ type)
tensorrt_llm::executor::TokenIdType (C++ type)
tensorrt_llm::executor::TypeTraits (C++ struct)
tensorrt_llm::executor::TypeTraits<bool> (C++ struct)
tensorrt_llm::executor::TypeTraits<bool>::value (C++ member)
tensorrt_llm::executor::TypeTraits<float> (C++ struct)
tensorrt_llm::executor::TypeTraits<float>::value (C++ member)
tensorrt_llm::executor::TypeTraits<half> (C++ struct)
tensorrt_llm::executor::TypeTraits<half>::value (C++ member)
tensorrt_llm::executor::TypeTraits<std::int32_t> (C++ struct)
tensorrt_llm::executor::TypeTraits<std::int32_t>::value (C++ member)
tensorrt_llm::executor::TypeTraits<std::int64_t> (C++ struct)
tensorrt_llm::executor::TypeTraits<std::int64_t>::value (C++ member)
tensorrt_llm::executor::TypeTraits<std::int8_t> (C++ struct)
tensorrt_llm::executor::TypeTraits<std::int8_t>::value (C++ member)
tensorrt_llm::executor::TypeTraits<std::uint8_t> (C++ struct)
tensorrt_llm::executor::TypeTraits<std::uint8_t>::value (C++ member)
tensorrt_llm::executor::TypeTraits<T*> (C++ struct)
tensorrt_llm::executor::TypeTraits<T*>::value (C++ member)
tensorrt_llm::executor::VecLogProbs (C++ type)
tensorrt_llm::executor::VecTokens (C++ type)
tensorrt_llm::layers (C++ type)
tensorrt_llm::runtime (C++ type)
,
[1]
,
[2]
,
[3]
,
[4]
,
[5]
,
[6]
,
[7]
,
[8]
,
[9]
,
[10]
,
[11]
,
[12]
,
[13]
,
[14]
,
[15]
,
[16]
,
[17]
,
[18]
,
[19]
,
[20]
,
[21]
,
[22]
,
[23]
,
[24]
,
[25]
,
[26]
,
[27]
,
[28]
tensorrt_llm::runtime::bufferCast (C++ function)
,
[1]
tensorrt_llm::runtime::BufferDataType (C++ class)
tensorrt_llm::runtime::BufferDataType::BufferDataType (C++ function)
tensorrt_llm::runtime::BufferDataType::getDataType (C++ function)
tensorrt_llm::runtime::BufferDataType::getSize (C++ function)
tensorrt_llm::runtime::BufferDataType::isPointer (C++ function)
tensorrt_llm::runtime::BufferDataType::isUnsigned (C++ function)
tensorrt_llm::runtime::BufferDataType::kTrtPointerType (C++ member)
tensorrt_llm::runtime::BufferDataType::mDataType (C++ member)
tensorrt_llm::runtime::BufferDataType::mPointer (C++ member)
tensorrt_llm::runtime::BufferDataType::mUnsigned (C++ member)
tensorrt_llm::runtime::BufferDataType::operator nvinfer1::DataType (C++ function)
tensorrt_llm::runtime::BufferManager (C++ class)
tensorrt_llm::runtime::BufferManager::allocate (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::BufferManager (C++ function)
tensorrt_llm::runtime::BufferManager::copy (C++ function)
,
[1]
,
[2]
,
[3]
,
[4]
tensorrt_llm::runtime::BufferManager::copyFrom (C++ function)
,
[1]
,
[2]
,
[3]
,
[4]
tensorrt_llm::runtime::BufferManager::cpu (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::BufferManager::emptyBuffer (C++ function)
tensorrt_llm::runtime::BufferManager::emptyTensor (C++ function)
tensorrt_llm::runtime::BufferManager::getStream (C++ function)
tensorrt_llm::runtime::BufferManager::gpu (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::gpuSync (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::IBufferPtr (C++ type)
tensorrt_llm::runtime::BufferManager::initMemoryPool (C++ function)
tensorrt_llm::runtime::BufferManager::ITensorPtr (C++ type)
tensorrt_llm::runtime::BufferManager::kBYTE_TYPE (C++ member)
tensorrt_llm::runtime::BufferManager::managed (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::memoryPoolFree (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::memoryPoolReserved (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::memoryPoolTrimTo (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::memoryPoolUsed (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::mStream (C++ member)
tensorrt_llm::runtime::BufferManager::mTrimPool (C++ member)
tensorrt_llm::runtime::BufferManager::pinned (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::pinnedPool (C++ function)
,
[1]
tensorrt_llm::runtime::BufferManager::setMem (C++ function)
tensorrt_llm::runtime::BufferManager::setZero (C++ function)
tensorrt_llm::runtime::BufferManager::~BufferManager (C++ function)
tensorrt_llm::runtime::BufferRange (C++ class)
tensorrt_llm::runtime::BufferRange::Base (C++ type)
tensorrt_llm::runtime::BufferRange::BufferRange (C++ function)
,
[1]
tensorrt_llm::runtime::constPointerCast (C++ function)
,
[1]
tensorrt_llm::runtime::CudaEvent (C++ class)
tensorrt_llm::runtime::CudaEvent::CudaEvent (C++ function)
,
[1]
tensorrt_llm::runtime::CudaEvent::Deleter (C++ class)
tensorrt_llm::runtime::CudaEvent::Deleter::Deleter (C++ function)
,
[1]
tensorrt_llm::runtime::CudaEvent::Deleter::mOwnsEvent (C++ member)
tensorrt_llm::runtime::CudaEvent::Deleter::operator() (C++ function)
tensorrt_llm::runtime::CudaEvent::element_type (C++ type)
tensorrt_llm::runtime::CudaEvent::EventPtr (C++ type)
tensorrt_llm::runtime::CudaEvent::get (C++ function)
tensorrt_llm::runtime::CudaEvent::mEvent (C++ member)
tensorrt_llm::runtime::CudaEvent::pointer (C++ type)
tensorrt_llm::runtime::CudaEvent::synchronize (C++ function)
tensorrt_llm::runtime::CudaStream (C++ class)
tensorrt_llm::runtime::CudaStream::CudaStream (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::CudaStream::Deleter (C++ class)
tensorrt_llm::runtime::CudaStream::Deleter::Deleter (C++ function)
,
[1]
tensorrt_llm::runtime::CudaStream::Deleter::mOwnsStream (C++ member)
tensorrt_llm::runtime::CudaStream::Deleter::operator() (C++ function)
tensorrt_llm::runtime::CudaStream::get (C++ function)
tensorrt_llm::runtime::CudaStream::getDevice (C++ function)
tensorrt_llm::runtime::CudaStream::mDevice (C++ member)
tensorrt_llm::runtime::CudaStream::mStream (C++ member)
tensorrt_llm::runtime::CudaStream::record (C++ function)
,
[1]
tensorrt_llm::runtime::CudaStream::StreamPtr (C++ type)
tensorrt_llm::runtime::CudaStream::synchronize (C++ function)
tensorrt_llm::runtime::CudaStream::wait (C++ function)
,
[1]
tensorrt_llm::runtime::DataTypeTraits (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<kDataType, kUnsigned, true> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<kDataType, kUnsigned, true>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<kDataType, kUnsigned, true>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<kDataType, kUnsigned, true>::type (C++ type)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>::type (C++ type)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kFLOAT> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kFLOAT>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kFLOAT>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kFLOAT>::type (C++ type)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kHALF> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kHALF>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kHALF>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kHALF>::type (C++ type)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT32, true> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT32, true>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT32, true>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT32, true>::type (C++ type)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT32> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT32>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT32>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT32>::type (C++ type)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT64, true> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT64, true>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT64, true>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT64, true>::type (C++ type)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT64> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT64>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT64>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT64>::type (C++ type)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT8> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT8>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT8>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kINT8>::type (C++ type)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned> (C++ struct)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>::name (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>::size (C++ member)
tensorrt_llm::runtime::DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>::type (C++ type)
tensorrt_llm::runtime::decoder (C++ type)
tensorrt_llm::runtime::decoder::Input (C++ class)
tensorrt_llm::runtime::decoder::Input::cacheIndirection (C++ member)
tensorrt_llm::runtime::decoder::Input::Input (C++ function)
tensorrt_llm::runtime::decoder::Input::logits (C++ member)
tensorrt_llm::runtime::decoder::Input::TensorPtr (C++ type)
tensorrt_llm::runtime::decoder::Output (C++ class)
tensorrt_llm::runtime::decoder::Output::cacheIndirection (C++ member)
tensorrt_llm::runtime::decoder::Output::Output (C++ function)
tensorrt_llm::runtime::decoder::Output::sequenceLengths (C++ member)
tensorrt_llm::runtime::decoder::Output::TensorPtr (C++ type)
tensorrt_llm::runtime::decoder_batch (C++ type)
tensorrt_llm::runtime::decoder_batch::Input (C++ class)
tensorrt_llm::runtime::decoder_batch::Input::active (C++ member)
tensorrt_llm::runtime::decoder_batch::Input::cacheIndirection (C++ member)
tensorrt_llm::runtime::decoder_batch::Input::Input (C++ function)
,
[1]
,
[2]
,
[3]
tensorrt_llm::runtime::decoder_batch::Input::logits (C++ member)
tensorrt_llm::runtime::decoder_batch::Input::medusaLogits (C++ member)
tensorrt_llm::runtime::decoder_batch::Input::TensorConstPtr (C++ type)
tensorrt_llm::runtime::decoder_batch::Input::TensorPtr (C++ type)
tensorrt_llm::runtime::decoder_batch::Output (C++ type)
tensorrt_llm::runtime::decoder_batch::Request (C++ class)
tensorrt_llm::runtime::decoder_batch::Request::badWordsList (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::BufferPtr (C++ type)
tensorrt_llm::runtime::decoder_batch::Request::computeCumLogProbs (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::computeLogProbs (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::ConstTensorPtr (C++ type)
tensorrt_llm::runtime::decoder_batch::Request::draftLogits (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::draftTokens (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::embeddingBias (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::endId (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::generatedTokensPerEngineStep (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::ids (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::inputLen (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::maxNewTokens (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::medusaPaths (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::medusaTreeIds (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::Request (C++ function)
tensorrt_llm::runtime::decoder_batch::Request::stopWordsList (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::TensorPtr (C++ type)
tensorrt_llm::runtime::decoder_batch::Token (C++ class)
tensorrt_llm::runtime::decoder_batch::Token::active (C++ member)
tensorrt_llm::runtime::decoder_batch::Token::event (C++ member)
tensorrt_llm::runtime::decoder_batch::Token::Token (C++ function)
tensorrt_llm::runtime::DecodingInput (C++ class)
tensorrt_llm::runtime::DecodingInput::badWordsLens (C++ member)
tensorrt_llm::runtime::DecodingInput::badWordsList (C++ member)
tensorrt_llm::runtime::DecodingInput::badWordsPtrs (C++ member)
tensorrt_llm::runtime::DecodingInput::batchSlots (C++ member)
tensorrt_llm::runtime::DecodingInput::cacheIndirection (C++ member)
tensorrt_llm::runtime::DecodingInput::DecodingInput (C++ function)
tensorrt_llm::runtime::DecodingInput::embeddingBias (C++ member)
tensorrt_llm::runtime::DecodingInput::endIds (C++ member)
tensorrt_llm::runtime::DecodingInput::finished (C++ member)
tensorrt_llm::runtime::DecodingInput::lengths (C++ member)
tensorrt_llm::runtime::DecodingInput::logits (C++ member)
tensorrt_llm::runtime::DecodingInput::logitsVec (C++ member)
tensorrt_llm::runtime::DecodingInput::maxAttentionWindow (C++ member)
tensorrt_llm::runtime::DecodingInput::maxBadWordsLen (C++ member)
tensorrt_llm::runtime::DecodingInput::maxBatchSize (C++ member)
tensorrt_llm::runtime::DecodingInput::maxLength (C++ member)
tensorrt_llm::runtime::DecodingInput::maxStopWordsLen (C++ member)
tensorrt_llm::runtime::DecodingInput::MedusaInputs (C++ class)
tensorrt_llm::runtime::DecodingInput::medusaInputs (C++ member)
tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaCurTokensPerStep (C++ member)
tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaLogits (C++ member)
tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaPaths (C++ member)
tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaTargetTokensPerStep (C++ member)
tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaTreeIds (C++ member)
tensorrt_llm::runtime::DecodingInput::noRepeatNgramSize (C++ member)
tensorrt_llm::runtime::DecodingInput::sequenceLimitLength (C++ member)
tensorrt_llm::runtime::DecodingInput::sinkTokenLength (C++ member)
tensorrt_llm::runtime::DecodingInput::step (C++ member)
tensorrt_llm::runtime::DecodingInput::stopWordsLens (C++ member)
tensorrt_llm::runtime::DecodingInput::stopWordsList (C++ member)
tensorrt_llm::runtime::DecodingInput::stopWordsPtrs (C++ member)
tensorrt_llm::runtime::DecodingInput::TensorPtr (C++ type)
tensorrt_llm::runtime::DecodingMode (C++ class)
tensorrt_llm::runtime::DecodingMode::allBitSet (C++ function)
tensorrt_llm::runtime::DecodingMode::anyBitSet (C++ function)
tensorrt_llm::runtime::DecodingMode::BeamSearch (C++ function)
tensorrt_llm::runtime::DecodingMode::DecodingMode (C++ function)
tensorrt_llm::runtime::DecodingMode::isBeamSearch (C++ function)
tensorrt_llm::runtime::DecodingMode::isMedusa (C++ function)
tensorrt_llm::runtime::DecodingMode::isNone (C++ function)
tensorrt_llm::runtime::DecodingMode::isTopK (C++ function)
tensorrt_llm::runtime::DecodingMode::isTopKandTopP (C++ function)
tensorrt_llm::runtime::DecodingMode::isTopKorTopP (C++ function)
tensorrt_llm::runtime::DecodingMode::isTopP (C++ function)
tensorrt_llm::runtime::DecodingMode::kBeamSearch (C++ member)
tensorrt_llm::runtime::DecodingMode::kMedusa (C++ member)
tensorrt_llm::runtime::DecodingMode::kNone (C++ member)
tensorrt_llm::runtime::DecodingMode::kTopK (C++ member)
tensorrt_llm::runtime::DecodingMode::kTopKTopP (C++ member)
tensorrt_llm::runtime::DecodingMode::kTopP (C++ member)
tensorrt_llm::runtime::DecodingMode::Medusa (C++ function)
tensorrt_llm::runtime::DecodingMode::mState (C++ member)
tensorrt_llm::runtime::DecodingMode::None (C++ function)
tensorrt_llm::runtime::DecodingMode::operator== (C++ function)
tensorrt_llm::runtime::DecodingMode::TopK (C++ function)
tensorrt_llm::runtime::DecodingMode::TopKTopP (C++ function)
tensorrt_llm::runtime::DecodingMode::TopP (C++ function)
tensorrt_llm::runtime::DecodingMode::UnderlyingType (C++ type)
tensorrt_llm::runtime::DecodingOutput (C++ class)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses (C++ class)
tensorrt_llm::runtime::DecodingOutput::beamHypotheses (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::cumLogProbs (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::empty (C++ function)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::init (C++ function)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::isDone (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::logProbs (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::minNormedScores (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::normedScores (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::numBeams (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::outputIdsTgt (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::release (C++ function)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape (C++ function)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::sequenceLengthsTgt (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::slice (C++ function)
tensorrt_llm::runtime::DecodingOutput::cacheIndirection (C++ member)
tensorrt_llm::runtime::DecodingOutput::cumLogProbs (C++ member)
tensorrt_llm::runtime::DecodingOutput::DecodingOutput (C++ function)
tensorrt_llm::runtime::DecodingOutput::finished (C++ member)
tensorrt_llm::runtime::DecodingOutput::finishedSum (C++ member)
tensorrt_llm::runtime::DecodingOutput::ids (C++ member)
tensorrt_llm::runtime::DecodingOutput::kNegativeInfinity (C++ member)
tensorrt_llm::runtime::DecodingOutput::lengths (C++ member)
tensorrt_llm::runtime::DecodingOutput::logProbs (C++ member)
tensorrt_llm::runtime::DecodingOutput::MedusaOutputs (C++ class)
tensorrt_llm::runtime::DecodingOutput::medusaOutputs (C++ member)
tensorrt_llm::runtime::DecodingOutput::MedusaOutputs::medusaAcceptedLengthsCumSum (C++ member)
tensorrt_llm::runtime::DecodingOutput::MedusaOutputs::medusaAcceptedTokensLen (C++ member)
tensorrt_llm::runtime::DecodingOutput::MedusaOutputs::medusaNextDraftTokens (C++ member)
tensorrt_llm::runtime::DecodingOutput::MedusaOutputs::medusaPathsOffsets (C++ member)
tensorrt_llm::runtime::DecodingOutput::newTokens (C++ member)
tensorrt_llm::runtime::DecodingOutput::newTokensSteps (C++ member)
tensorrt_llm::runtime::DecodingOutput::newTokensVec (C++ member)
tensorrt_llm::runtime::DecodingOutput::parentIds (C++ member)
tensorrt_llm::runtime::DecodingOutput::TensorPtr (C++ type)
tensorrt_llm::runtime::GenerationInput (C++ class)
tensorrt_llm::runtime::GenerationInput::Base (C++ type)
tensorrt_llm::runtime::GenerationInput::GenerationInput (C++ function)
tensorrt_llm::runtime::GenerationInput::TensorPtr (C++ type)
tensorrt_llm::runtime::GenerationOutput (C++ class)
tensorrt_llm::runtime::GenerationOutput::Base (C++ type)
tensorrt_llm::runtime::GenerationOutput::GenerationOutput (C++ function)
tensorrt_llm::runtime::GenerationOutput::TensorPtr (C++ type)
tensorrt_llm::runtime::GenericGenerationInput (C++ class)
tensorrt_llm::runtime::GenericGenerationInput::badWordsList (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::embeddingBias (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::endId (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::GenericGenerationInput (C++ function)
tensorrt_llm::runtime::GenericGenerationInput::ids (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::lengths (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::maxNewTokens (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::packed (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::padId (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::promptTuningParams (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::stopWordsList (C++ member)
tensorrt_llm::runtime::GenericGenerationInput::TensorPtr (C++ type)
tensorrt_llm::runtime::GenericGenerationOutput (C++ class)
tensorrt_llm::runtime::GenericGenerationOutput::Callback (C++ type)
tensorrt_llm::runtime::GenericGenerationOutput::contextLogits (C++ member)
tensorrt_llm::runtime::GenericGenerationOutput::cumLogProbs (C++ member)
tensorrt_llm::runtime::GenericGenerationOutput::generationLogits (C++ member)
tensorrt_llm::runtime::GenericGenerationOutput::GenericGenerationOutput (C++ function)
tensorrt_llm::runtime::GenericGenerationOutput::ids (C++ member)
tensorrt_llm::runtime::GenericGenerationOutput::lengths (C++ member)
tensorrt_llm::runtime::GenericGenerationOutput::logProbs (C++ member)
tensorrt_llm::runtime::GenericGenerationOutput::onTokenGenerated (C++ member)
tensorrt_llm::runtime::GenericGenerationOutput::TensorPtr (C++ type)
tensorrt_llm::runtime::GenericPromptTuningParams (C++ class)
tensorrt_llm::runtime::GenericPromptTuningParams::embeddingTable (C++ member)
tensorrt_llm::runtime::GenericPromptTuningParams::GenericPromptTuningParams (C++ function)
tensorrt_llm::runtime::GenericPromptTuningParams::promptTuningEnabled (C++ member)
tensorrt_llm::runtime::GenericPromptTuningParams::SizeType (C++ type)
tensorrt_llm::runtime::GenericPromptTuningParams::tasks (C++ member)
tensorrt_llm::runtime::GenericPromptTuningParams::TensorPtr (C++ type)
tensorrt_llm::runtime::GenericPromptTuningParams::vocabSize (C++ member)
tensorrt_llm::runtime::GptDecoder (C++ class)
tensorrt_llm::runtime::GptDecoder::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::GptDecoder::forward (C++ function)
tensorrt_llm::runtime::GptDecoder::forwardAsync (C++ function)
tensorrt_llm::runtime::GptDecoder::gatherTree (C++ function)
tensorrt_llm::runtime::GptDecoder::getSamplingConfig (C++ function)
tensorrt_llm::runtime::GptDecoder::GptDecoder (C++ function)
tensorrt_llm::runtime::GptDecoder::mDynamicDecodeLayer (C++ member)
tensorrt_llm::runtime::GptDecoder::mLogProbsTiled (C++ member)
tensorrt_llm::runtime::GptDecoder::mManager (C++ member)
tensorrt_llm::runtime::GptDecoder::mMaxBatchSize (C++ member)
tensorrt_llm::runtime::GptDecoder::mProp (C++ member)
tensorrt_llm::runtime::GptDecoder::mSamplingConfig (C++ member)
tensorrt_llm::runtime::GptDecoder::setup (C++ function)
tensorrt_llm::runtime::GptDecoder::TensorPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch (C++ class)
tensorrt_llm::runtime::GptDecoderBatch::allocateMedusaBuffers (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch::DecodingInputPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch::DecodingOutputPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch::finalize (C++ function)
,
[1]
tensorrt_llm::runtime::GptDecoderBatch::forwardAsync (C++ function)
,
[1]
tensorrt_llm::runtime::GptDecoderBatch::forwardAsyncFusedDecoder (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::forwardAsyncUnfusedDecoder (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::forwardSync (C++ function)
,
[1]
tensorrt_llm::runtime::GptDecoderBatch::getAllNewTokens (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getCumLogProbs (C++ function)
,
[1]
tensorrt_llm::runtime::GptDecoderBatch::getFinished (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getLogProbs (C++ function)
,
[1]
tensorrt_llm::runtime::GptDecoderBatch::getMedusaAcceptedLengthsCumSum (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getMedusaAcceptedPackedPaths (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getNbFinished (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getNbSteps (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getNewTokens (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getNextDraftTokens (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getOutputIds (C++ function)
,
[1]
tensorrt_llm::runtime::GptDecoderBatch::getParentIds (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::GptDecoderBatch (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::GptDecoderPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch::mAcceptByLogits (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mActualBatchSize (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mBatchSlotsAcceptLogits (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mBatchSlotsAcceptTokens (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mBatchSlotsDecoder (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mBatchSlotsSetup (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mBeamWidths (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mBufferManager (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mCurandStates (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mDecoders (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mDecodingInputs (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mDecodingOutputs (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mDraftLogits (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mDraftProbs (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mDraftTokenIds (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mFinished (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mFinishedSteps (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mFinishedSum (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mForwardEvent (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mForwardToken (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mFusedDecoder (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mGeneratedTokensPerEngineStep (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mJointDecodingInput (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mJointDecodingOutput (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mMaxAttentionWindow (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mMaxBadWordsLen (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mMaxNewTokens (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mMaxSequenceLength (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mMaxStopWordsLen (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mMaxTokensPerDecoderStep (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mMaxTokensPerEngineStep (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mNbSteps (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mNumDraftTokens (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mSinkTokenLength (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mStream (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mStreams (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mTargetLogitsPtrs (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mTargetProbs (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mUseMedusa (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mVocabSize (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mVocabSizePadded (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::newBatch (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::newRequest (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::newRequestMedusa (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::newRequests (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::newRequestSpeculativeDecoding (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::postProcessRequest (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::setup (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::setupMedusa (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::SharedConstPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch::TensorPtr (C++ type)
tensorrt_llm::runtime::GptJsonConfig (C++ class)
tensorrt_llm::runtime::GptJsonConfig::engineFilename (C++ function)
,
[1]
tensorrt_llm::runtime::GptJsonConfig::getModelConfig (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getName (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getPipelineParallelism (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getPrecision (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getTensorParallelism (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getVersion (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getWorldSize (C++ function)
tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig (C++ function)
tensorrt_llm::runtime::GptJsonConfig::mGptModelConfig (C++ member)
tensorrt_llm::runtime::GptJsonConfig::mName (C++ member)
tensorrt_llm::runtime::GptJsonConfig::mPipelineParallelism (C++ member)
tensorrt_llm::runtime::GptJsonConfig::mPrecision (C++ member)
tensorrt_llm::runtime::GptJsonConfig::mTensorParallelism (C++ member)
tensorrt_llm::runtime::GptJsonConfig::mVersion (C++ member)
tensorrt_llm::runtime::GptJsonConfig::parse (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::GptModelConfig (C++ class)
tensorrt_llm::runtime::GptModelConfig::computeContextLogits (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::computeGenerationLogits (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::getContextFMHAForGeneration (C++ function)
tensorrt_llm::runtime::GptModelConfig::getDataType (C++ function)
tensorrt_llm::runtime::GptModelConfig::getHiddenSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::getKvDataType (C++ function)
tensorrt_llm::runtime::GptModelConfig::getLoraModules (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMambaConfig (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxBatchSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxBeamWidth (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxDraftLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxInputLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxLoraRank (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxNumTokens (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxPromptEmbeddingTableSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxSequenceLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxTokensPerStep (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMedusaModule (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMlpHiddenSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::getModelVariant (C++ function)
tensorrt_llm::runtime::GptModelConfig::getNbHeads (C++ function)
tensorrt_llm::runtime::GptModelConfig::getNbKvHeads (C++ function)
tensorrt_llm::runtime::GptModelConfig::getNbLayers (C++ function)
tensorrt_llm::runtime::GptModelConfig::getPagedContextFMHA (C++ function)
tensorrt_llm::runtime::GptModelConfig::getQuantMode (C++ function)
tensorrt_llm::runtime::GptModelConfig::getSizePerHead (C++ function)
tensorrt_llm::runtime::GptModelConfig::getTokensPerBlock (C++ function)
tensorrt_llm::runtime::GptModelConfig::getVocabSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::getVocabSizePadded (C++ function)
tensorrt_llm::runtime::GptModelConfig::GptModelConfig (C++ function)
tensorrt_llm::runtime::GptModelConfig::hasMambaConfig (C++ function)
tensorrt_llm::runtime::GptModelConfig::isSsmBased (C++ function)
tensorrt_llm::runtime::GptModelConfig::isTransformerBased (C++ function)
tensorrt_llm::runtime::GptModelConfig::mComputeContextLogits (C++ member)
tensorrt_llm::runtime::GptModelConfig::mComputeGenerationLogits (C++ member)
tensorrt_llm::runtime::GptModelConfig::mDataType (C++ member)
tensorrt_llm::runtime::GptModelConfig::mHiddenSize (C++ member)
tensorrt_llm::runtime::GptModelConfig::mInputPacked (C++ member)
tensorrt_llm::runtime::GptModelConfig::mLoraModules (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMambaConfig (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxBatchSize (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxBeamWidth (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxDraftLen (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxInputLen (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxLoraRank (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxNumTokens (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxPromptEmbeddingTableSize (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxSequenceLen (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMedusaModule (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMlpHiddenSize (C++ member)
tensorrt_llm::runtime::GptModelConfig::mModelVariant (C++ member)
tensorrt_llm::runtime::GptModelConfig::mNbHeads (C++ member)
tensorrt_llm::runtime::GptModelConfig::mNbKvHeads (C++ member)
tensorrt_llm::runtime::GptModelConfig::mNbLayers (C++ member)
tensorrt_llm::runtime::GptModelConfig::ModelVariant (C++ enum)
tensorrt_llm::runtime::GptModelConfig::ModelVariant::kGlm (C++ enumerator)
tensorrt_llm::runtime::GptModelConfig::ModelVariant::kGpt (C++ enumerator)
tensorrt_llm::runtime::GptModelConfig::ModelVariant::kMamba (C++ enumerator)
tensorrt_llm::runtime::GptModelConfig::mPagedContextFMHA (C++ member)
tensorrt_llm::runtime::GptModelConfig::mPagedKvCache (C++ member)
tensorrt_llm::runtime::GptModelConfig::mPagedState (C++ member)
tensorrt_llm::runtime::GptModelConfig::mQuantMode (C++ member)
tensorrt_llm::runtime::GptModelConfig::mSizePerHead (C++ member)
tensorrt_llm::runtime::GptModelConfig::mTokensPerBlock (C++ member)
tensorrt_llm::runtime::GptModelConfig::mUseContextFMHAForGeneration (C++ member)
tensorrt_llm::runtime::GptModelConfig::mUseCustomAllReduce (C++ member)
tensorrt_llm::runtime::GptModelConfig::mUseGptAttentionPlugin (C++ member)
tensorrt_llm::runtime::GptModelConfig::mUseLoraPlugin (C++ member)
tensorrt_llm::runtime::GptModelConfig::mUseMambaConv1dPlugin (C++ member)
tensorrt_llm::runtime::GptModelConfig::mVocabSize (C++ member)
tensorrt_llm::runtime::GptModelConfig::setLoraModules (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMambaConfig (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxBatchSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxBeamWidth (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxDraftLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxInputLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxLoraRank (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxNumTokens (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxPromptEmbeddingTableSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxSequenceLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMedusaModule (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMlpHiddenSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::setModelVariant (C++ function)
tensorrt_llm::runtime::GptModelConfig::setNbKvHeads (C++ function)
tensorrt_llm::runtime::GptModelConfig::setPagedContextFMHA (C++ function)
tensorrt_llm::runtime::GptModelConfig::setQuantMode (C++ function)
tensorrt_llm::runtime::GptModelConfig::setSizePerHead (C++ function)
tensorrt_llm::runtime::GptModelConfig::setTokensPerBlock (C++ function)
tensorrt_llm::runtime::GptModelConfig::setUseContextFMHAForGeneration (C++ function)
tensorrt_llm::runtime::GptModelConfig::supportsInflightBatching (C++ function)
tensorrt_llm::runtime::GptModelConfig::useCustomAllReduce (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::useGptAttentionPlugin (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::useLoraPlugin (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::useMambaConv1dPlugin (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::useMedusa (C++ function)
tensorrt_llm::runtime::GptModelConfig::usePackedInput (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::usePagedKvCache (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::usePagedState (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::usePromptTuning (C++ function)
tensorrt_llm::runtime::GptSession (C++ class)
tensorrt_llm::runtime::GptSession::Config (C++ class)
tensorrt_llm::runtime::GptSession::Config::Config (C++ function)
tensorrt_llm::runtime::GptSession::Config::ctxMicroBatchSize (C++ member)
tensorrt_llm::runtime::GptSession::Config::cudaGraphMode (C++ member)
tensorrt_llm::runtime::GptSession::Config::decoderPerRequest (C++ member)
tensorrt_llm::runtime::GptSession::Config::decodingMode (C++ member)
tensorrt_llm::runtime::GptSession::Config::genMicroBatchSize (C++ member)
tensorrt_llm::runtime::GptSession::Config::kvCacheConfig (C++ member)
tensorrt_llm::runtime::GptSession::Config::maxBatchSize (C++ member)
tensorrt_llm::runtime::GptSession::Config::maxBeamWidth (C++ member)
tensorrt_llm::runtime::GptSession::Config::maxSequenceLength (C++ member)
tensorrt_llm::runtime::GptSession::Config::normalizeLogProbs (C++ member)
tensorrt_llm::runtime::GptSession::createBuffers (C++ function)
tensorrt_llm::runtime::GptSession::createContexts (C++ function)
tensorrt_llm::runtime::GptSession::createCustomAllReduceWorkspace (C++ function)
tensorrt_llm::runtime::GptSession::createDecoders (C++ function)
tensorrt_llm::runtime::GptSession::createKvCacheManager (C++ function)
tensorrt_llm::runtime::GptSession::createOnTokenGeneratedCallback (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor (C++ class)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::clear (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::create (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::CudaGraphExecutor (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::hasInstance (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::launch (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::mInstance (C++ member)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::prepareNextGraph (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::update (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::uploadToStream (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::~CudaGraphExecutor (C++ function)
tensorrt_llm::runtime::GptSession::decoderStepAsync (C++ function)
tensorrt_llm::runtime::GptSession::executeContextStep (C++ function)
tensorrt_llm::runtime::GptSession::executeGenerationStep (C++ function)
tensorrt_llm::runtime::GptSession::finalize (C++ function)
tensorrt_llm::runtime::GptSession::generate (C++ function)
tensorrt_llm::runtime::GptSession::generateBatched (C++ function)
tensorrt_llm::runtime::GptSession::GenerationProfiler (C++ class)
tensorrt_llm::runtime::GptSession::GenerationProfiler::end (C++ member)
tensorrt_llm::runtime::GptSession::GenerationProfiler::flags (C++ member)
tensorrt_llm::runtime::GptSession::GenerationProfiler::GenerationProfiler (C++ function)
tensorrt_llm::runtime::GptSession::GenerationProfiler::getElapsedTimeMs (C++ function)
tensorrt_llm::runtime::GptSession::GenerationProfiler::getEnd (C++ function)
tensorrt_llm::runtime::GptSession::GenerationProfiler::getStart (C++ function)
tensorrt_llm::runtime::GptSession::GenerationProfiler::start (C++ member)
tensorrt_llm::runtime::GptSession::getBufferManager (C++ function)
tensorrt_llm::runtime::GptSession::getDevice (C++ function)
tensorrt_llm::runtime::GptSession::getLogger (C++ function)
tensorrt_llm::runtime::GptSession::getLogitDataType (C++ function)
tensorrt_llm::runtime::GptSession::getModelConfig (C++ function)
tensorrt_llm::runtime::GptSession::getNormalizeLogProbs (C++ function)
tensorrt_llm::runtime::GptSession::getWorldConfig (C++ function)
tensorrt_llm::runtime::GptSession::GptSession (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::GptSession::initDecoder (C++ function)
tensorrt_llm::runtime::GptSession::kvCacheAddSequences (C++ function)
tensorrt_llm::runtime::GptSession::KvCacheConfig (C++ type)
tensorrt_llm::runtime::GptSession::KvCacheManager (C++ type)
tensorrt_llm::runtime::GptSession::LoggerPtr (C++ type)
tensorrt_llm::runtime::GptSession::mBuffers (C++ member)
tensorrt_llm::runtime::GptSession::mCommEvent (C++ member)
tensorrt_llm::runtime::GptSession::mCommPtrs (C++ member)
tensorrt_llm::runtime::GptSession::mCommStream (C++ member)
tensorrt_llm::runtime::GptSession::mCudaGraphInstances (C++ member)
tensorrt_llm::runtime::GptSession::mCudaGraphMode (C++ member)
tensorrt_llm::runtime::GptSession::mDecoderMaxAttentionWindow (C++ member)
tensorrt_llm::runtime::GptSession::mDecoderMaxSequenceLength (C++ member)
tensorrt_llm::runtime::GptSession::mDecoders (C++ member)
tensorrt_llm::runtime::GptSession::mDecoderSinkTokenLength (C++ member)
tensorrt_llm::runtime::GptSession::mDevice (C++ member)
tensorrt_llm::runtime::GptSession::MicroBatchConfig (C++ class)
tensorrt_llm::runtime::GptSession::MicroBatchConfig::ctxBatchSize (C++ member)
tensorrt_llm::runtime::GptSession::MicroBatchConfig::genBatchSize (C++ member)
tensorrt_llm::runtime::GptSession::MicroBatchConfig::getGenGraphId (C++ function)
tensorrt_llm::runtime::GptSession::MicroBatchConfig::MicroBatchConfig (C++ function)
,
[1]
tensorrt_llm::runtime::GptSession::MicroBatchConfig::numCtxBatches (C++ member)
tensorrt_llm::runtime::GptSession::MicroBatchConfig::numCtxPerGen (C++ function)
tensorrt_llm::runtime::GptSession::MicroBatchConfig::numGenBatches (C++ member)
tensorrt_llm::runtime::GptSession::mIpcMemoryHandles (C++ member)
tensorrt_llm::runtime::GptSession::mKvCacheManager (C++ member)
tensorrt_llm::runtime::GptSession::mLogger (C++ member)
tensorrt_llm::runtime::GptSession::mMicroBatchConfig (C++ member)
tensorrt_llm::runtime::GptSession::mModelConfig (C++ member)
tensorrt_llm::runtime::GptSession::mNormalizeLogProbs (C++ member)
tensorrt_llm::runtime::GptSession::mPipelineComm (C++ member)
tensorrt_llm::runtime::GptSession::mReceivedEvents (C++ member)
tensorrt_llm::runtime::GptSession::mRuntime (C++ member)
tensorrt_llm::runtime::GptSession::mWorldConfig (C++ member)
tensorrt_llm::runtime::GptSession::setup (C++ function)
tensorrt_llm::runtime::GptSession::shouldStopSync (C++ function)
tensorrt_llm::runtime::GptSession::TensorPtr (C++ type)
tensorrt_llm::runtime::GptSession::TokenGeneratedCallback (C++ type)
tensorrt_llm::runtime::GptSession::useCudaGraphs (C++ function)
tensorrt_llm::runtime::IBuffer (C++ class)
tensorrt_llm::runtime::IBuffer::data (C++ function)
,
[1]
,
[2]
,
[3]
tensorrt_llm::runtime::IBuffer::DataType (C++ type)
tensorrt_llm::runtime::IBuffer::getCapacity (C++ function)
tensorrt_llm::runtime::IBuffer::getDataType (C++ function)
tensorrt_llm::runtime::IBuffer::getDataTypeName (C++ function)
tensorrt_llm::runtime::IBuffer::getMemoryType (C++ function)
tensorrt_llm::runtime::IBuffer::getMemoryTypeName (C++ function)
tensorrt_llm::runtime::IBuffer::getSize (C++ function)
tensorrt_llm::runtime::IBuffer::getSizeInBytes (C++ function)
tensorrt_llm::runtime::IBuffer::IBuffer (C++ function)
,
[1]
tensorrt_llm::runtime::IBuffer::memoryType (C++ function)
tensorrt_llm::runtime::IBuffer::operator= (C++ function)
tensorrt_llm::runtime::IBuffer::release (C++ function)
tensorrt_llm::runtime::IBuffer::resize (C++ function)
tensorrt_llm::runtime::IBuffer::SharedConstPtr (C++ type)
tensorrt_llm::runtime::IBuffer::SharedPtr (C++ type)
tensorrt_llm::runtime::IBuffer::slice (C++ function)
,
[1]
,
[2]
,
[3]
tensorrt_llm::runtime::IBuffer::toBytes (C++ function)
tensorrt_llm::runtime::IBuffer::UniqueConstPtr (C++ type)
tensorrt_llm::runtime::IBuffer::UniquePtr (C++ type)
tensorrt_llm::runtime::IBuffer::view (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::IBuffer::wrap (C++ function)
,
[1]
,
[2]
,
[3]
,
[4]
tensorrt_llm::runtime::IBuffer::~IBuffer (C++ function)
tensorrt_llm::runtime::IGptDecoder (C++ class)
tensorrt_llm::runtime::IGptDecoder::acceptDraftTokensByIds (C++ function)
tensorrt_llm::runtime::IGptDecoder::acceptDraftTokensByLogits (C++ function)
tensorrt_llm::runtime::IGptDecoder::create (C++ function)
tensorrt_llm::runtime::IGptDecoder::forward (C++ function)
tensorrt_llm::runtime::IGptDecoder::forwardAsync (C++ function)
tensorrt_llm::runtime::IGptDecoder::gatherTree (C++ function)
tensorrt_llm::runtime::IGptDecoder::getSamplingConfig (C++ function)
tensorrt_llm::runtime::IGptDecoder::setup (C++ function)
tensorrt_llm::runtime::IGptDecoder::TensorPtr (C++ type)
tensorrt_llm::runtime::IGptDecoder::updateKVCacheBasedOnAcceptedTokens (C++ function)
tensorrt_llm::runtime::IGptDecoder::~IGptDecoder (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch (C++ class)
tensorrt_llm::runtime::IGptDecoderBatch::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::IGptDecoderBatch::finalize (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::forward (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::forwardAsync (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::forwardSync (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getCumLogProbs (C++ function)
,
[1]
tensorrt_llm::runtime::IGptDecoderBatch::getFinished (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getLogProbs (C++ function)
,
[1]
tensorrt_llm::runtime::IGptDecoderBatch::getMedusaAcceptedLengthsCumSum (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getMedusaAcceptedPackedPaths (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getNbSteps (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getNextDraftTokens (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getOutputIds (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getParentIds (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::IGptDecoderBatch (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::newRequests (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::TensorPtr (C++ type)
tensorrt_llm::runtime::IGptDecoderBatch::TokenPtr (C++ type)
tensorrt_llm::runtime::IpcMemory (C++ class)
tensorrt_llm::runtime::IpcMemory::allocateIpcMemory (C++ function)
tensorrt_llm::runtime::IpcMemory::destroyIpcMemory (C++ function)
tensorrt_llm::runtime::IpcMemory::FLAGS_SIZE (C++ member)
tensorrt_llm::runtime::IpcMemory::getCommPtrsTensor (C++ function)
tensorrt_llm::runtime::IpcMemory::IpcMemory (C++ function)
tensorrt_llm::runtime::IpcMemory::mBufferPtr (C++ member)
tensorrt_llm::runtime::IpcMemory::mBufferSize (C++ member)
tensorrt_llm::runtime::IpcMemory::mCommPtrs (C++ member)
tensorrt_llm::runtime::IpcMemory::mWorldConfig (C++ member)
tensorrt_llm::runtime::IpcMemory::TensorPtr (C++ type)
tensorrt_llm::runtime::IpcMemory::~IpcMemory (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder (C++ class)
tensorrt_llm::runtime::IStatefulGptDecoder::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::IStatefulGptDecoder::finalize (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::forward (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::forwardAsync (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::forwardSync (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getAllNewTokens (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getCumLogProbs (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getLogProbs (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getNbFinished (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getNewTokens (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getOutputIds (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::IStatefulGptDecoder (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::newBatch (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::setup (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::TensorPtr (C++ type)
tensorrt_llm::runtime::IStatefulGptDecoder::~IStatefulGptDecoder (C++ function)
tensorrt_llm::runtime::ITensor (C++ class)
tensorrt_llm::runtime::ITensor::castSize (C++ function)
tensorrt_llm::runtime::ITensor::DimType (C++ type)
tensorrt_llm::runtime::ITensor::getShape (C++ function)
tensorrt_llm::runtime::ITensor::ITensor (C++ function)
,
[1]
tensorrt_llm::runtime::ITensor::makeShape (C++ function)
tensorrt_llm::runtime::ITensor::operator= (C++ function)
tensorrt_llm::runtime::ITensor::reshape (C++ function)
tensorrt_llm::runtime::ITensor::resize (C++ function)
tensorrt_llm::runtime::ITensor::Shape (C++ type)
tensorrt_llm::runtime::ITensor::shapeEquals (C++ function)
,
[1]
,
[2]
,
[3]
,
[4]
tensorrt_llm::runtime::ITensor::SharedConstPtr (C++ type)
tensorrt_llm::runtime::ITensor::SharedPtr (C++ type)
tensorrt_llm::runtime::ITensor::slice (C++ function)
,
[1]
,
[2]
,
[3]
tensorrt_llm::runtime::ITensor::squeeze (C++ function)
,
[1]
tensorrt_llm::runtime::ITensor::toString (C++ function)
tensorrt_llm::runtime::ITensor::UniqueConstPtr (C++ type)
tensorrt_llm::runtime::ITensor::UniquePtr (C++ type)
tensorrt_llm::runtime::ITensor::unsqueeze (C++ function)
,
[1]
tensorrt_llm::runtime::ITensor::view (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::ITensor::volume (C++ function)
tensorrt_llm::runtime::ITensor::volumeNonNegative (C++ function)
tensorrt_llm::runtime::ITensor::wrap (C++ function)
,
[1]
,
[2]
,
[3]
,
[4]
tensorrt_llm::runtime::ITensor::~ITensor (C++ function)
tensorrt_llm::runtime::LoraCache (C++ class)
tensorrt_llm::runtime::LoraCache::bump (C++ function)
tensorrt_llm::runtime::LoraCache::bumpTaskInProgress (C++ function)
tensorrt_llm::runtime::LoraCache::claimPagesWithEvict (C++ function)
tensorrt_llm::runtime::LoraCache::copyTask (C++ function)
tensorrt_llm::runtime::LoraCache::copyTaskMapPages (C++ function)
tensorrt_llm::runtime::LoraCache::copyToPages (C++ function)
tensorrt_llm::runtime::LoraCache::determineNumPages (C++ function)
,
[1]
tensorrt_llm::runtime::LoraCache::fits (C++ function)
tensorrt_llm::runtime::LoraCache::get (C++ function)
tensorrt_llm::runtime::LoraCache::getNumPages (C++ function)
tensorrt_llm::runtime::LoraCache::getPagePtr (C++ function)
tensorrt_llm::runtime::LoraCache::getStatus (C++ function)
tensorrt_llm::runtime::LoraCache::has (C++ function)
tensorrt_llm::runtime::LoraCache::isDone (C++ function)
tensorrt_llm::runtime::LoraCache::isLoaded (C++ function)
tensorrt_llm::runtime::LoraCache::loadWeights (C++ function)
,
[1]
tensorrt_llm::runtime::LoraCache::LoraCache (C++ function)
tensorrt_llm::runtime::LoraCache::markAllDone (C++ function)
tensorrt_llm::runtime::LoraCache::markTaskDone (C++ function)
tensorrt_llm::runtime::LoraCache::mBufferManager (C++ member)
tensorrt_llm::runtime::LoraCache::mCacheMap (C++ member)
tensorrt_llm::runtime::LoraCache::mCacheMutex (C++ member)
tensorrt_llm::runtime::LoraCache::mCachePageManager (C++ member)
tensorrt_llm::runtime::LoraCache::mDeviceBufferManagers (C++ member)
tensorrt_llm::runtime::LoraCache::mDoneTasks (C++ member)
tensorrt_llm::runtime::LoraCache::mInProgressTasks (C++ member)
tensorrt_llm::runtime::LoraCache::mModelConfig (C++ member)
tensorrt_llm::runtime::LoraCache::mModuleIdToModule (C++ member)
tensorrt_llm::runtime::LoraCache::mPageManagerConfig (C++ member)
tensorrt_llm::runtime::LoraCache::mPagesMutex (C++ member)
tensorrt_llm::runtime::LoraCache::mWorldConfig (C++ member)
tensorrt_llm::runtime::LoraCache::put (C++ function)
tensorrt_llm::runtime::LoraCache::splitTransposeCpu (C++ function)
tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner (C++ function)
tensorrt_llm::runtime::LoraCache::TaskIdType (C++ type)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig (C++ struct)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::adapterSize (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::inSize (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::layerId (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::moduleId (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::numSlots (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::operator== (C++ function)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::outSize (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::pageId (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::slotIdx (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::toString (C++ function)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::weightsInPointer (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::weightsOutPointer (C++ member)
tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfigListPtr (C++ type)
tensorrt_llm::runtime::LoraCache::TaskValue (C++ struct)
tensorrt_llm::runtime::LoraCache::TaskValue::configs (C++ member)
tensorrt_llm::runtime::LoraCache::TaskValue::done (C++ member)
tensorrt_llm::runtime::LoraCache::TaskValue::inProgress (C++ member)
tensorrt_llm::runtime::LoraCache::TaskValue::it (C++ member)
tensorrt_llm::runtime::LoraCache::TaskValue::loaded (C++ member)
tensorrt_llm::runtime::LoraCache::TaskValue::loadInProgress (C++ member)
tensorrt_llm::runtime::LoraCache::TaskValue::operator= (C++ function)
tensorrt_llm::runtime::LoraCache::TaskValue::pageIds (C++ member)
tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::LoraCache::TaskValue::~TaskValue (C++ function)
tensorrt_llm::runtime::LoraCache::TaskValuePtr (C++ type)
tensorrt_llm::runtime::LoraCache::TensorPtr (C++ type)
tensorrt_llm::runtime::LoraCache::ValueStatus (C++ enum)
tensorrt_llm::runtime::LoraCache::ValueStatus::kVALUE_STATUS_LOADED (C++ enumerator)
tensorrt_llm::runtime::LoraCache::ValueStatus::kVALUE_STATUS_MISSING (C++ enumerator)
tensorrt_llm::runtime::LoraCache::ValueStatus::kVALUE_STATUS_PROCESSING (C++ enumerator)
tensorrt_llm::runtime::LoraCachePageManager (C++ class)
tensorrt_llm::runtime::LoraCachePageManager::blockPtr (C++ function)
tensorrt_llm::runtime::LoraCachePageManager::claimPages (C++ function)
tensorrt_llm::runtime::LoraCachePageManager::initialize (C++ function)
tensorrt_llm::runtime::LoraCachePageManager::LoraCachePageManager (C++ function)
tensorrt_llm::runtime::LoraCachePageManager::mConfig (C++ member)
tensorrt_llm::runtime::LoraCachePageManager::mFreePageIds (C++ member)
tensorrt_llm::runtime::LoraCachePageManager::mIsPageFree (C++ member)
tensorrt_llm::runtime::LoraCachePageManager::mPageBlocks (C++ member)
tensorrt_llm::runtime::LoraCachePageManager::mutablePagePtr (C++ function)
tensorrt_llm::runtime::LoraCachePageManager::numAvailablePages (C++ function)
tensorrt_llm::runtime::LoraCachePageManager::pagePtr (C++ function)
tensorrt_llm::runtime::LoraCachePageManager::releasePages (C++ function)
tensorrt_llm::runtime::LoraCachePageManager::TensorPtr (C++ type)
tensorrt_llm::runtime::LoraCachePageManagerConfig (C++ class)
tensorrt_llm::runtime::LoraCachePageManagerConfig::getDataType (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::getInitToZero (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::getMaxPagesPerBlock (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::getMemoryType (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::getNumCopyStreams (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::getPageWidth (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::getSlotsPerPage (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::getTotalNumPages (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::mDataType (C++ member)
tensorrt_llm::runtime::LoraCachePageManagerConfig::mInitToZero (C++ member)
tensorrt_llm::runtime::LoraCachePageManagerConfig::mMaxPagesPerBlock (C++ member)
tensorrt_llm::runtime::LoraCachePageManagerConfig::mMemoryType (C++ member)
tensorrt_llm::runtime::LoraCachePageManagerConfig::mNumCopyStreams (C++ member)
tensorrt_llm::runtime::LoraCachePageManagerConfig::mPageWidth (C++ member)
tensorrt_llm::runtime::LoraCachePageManagerConfig::mSlotsPerPage (C++ member)
tensorrt_llm::runtime::LoraCachePageManagerConfig::mTotalNumPages (C++ member)
tensorrt_llm::runtime::LoraCachePageManagerConfig::setDataType (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::setInitToZero (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::setMaxPagesPerBlock (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::setMemoryType (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::setNumCopyStreams (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::setPageWidth (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::setSlotsPerPage (C++ function)
tensorrt_llm::runtime::LoraCachePageManagerConfig::setTotalNumPage (C++ function)
tensorrt_llm::runtime::LoraModule (C++ class)
tensorrt_llm::runtime::LoraModule::createLoraModules (C++ function)
tensorrt_llm::runtime::LoraModule::flattenedInOutSize (C++ function)
tensorrt_llm::runtime::LoraModule::inDim (C++ function)
tensorrt_llm::runtime::LoraModule::inDimFirst (C++ function)
tensorrt_llm::runtime::LoraModule::inSize (C++ function)
tensorrt_llm::runtime::LoraModule::inTpSplitDim (C++ function)
tensorrt_llm::runtime::LoraModule::localInAdapterSize (C++ function)
tensorrt_llm::runtime::LoraModule::localInDim (C++ function)
tensorrt_llm::runtime::LoraModule::localInOutSize (C++ function)
tensorrt_llm::runtime::LoraModule::localInSize (C++ function)
tensorrt_llm::runtime::LoraModule::localOutAdapterSize (C++ function)
tensorrt_llm::runtime::LoraModule::localOutDim (C++ function)
tensorrt_llm::runtime::LoraModule::localOutSize (C++ function)
tensorrt_llm::runtime::LoraModule::LoraModule (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::LoraModule::mInDim (C++ member)
tensorrt_llm::runtime::LoraModule::mInDimFirst (C++ member)
tensorrt_llm::runtime::LoraModule::mInTpSplitDim (C++ member)
tensorrt_llm::runtime::LoraModule::ModuleType (C++ enum)
tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_DENSE (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_K (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_Q (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_QKV (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_V (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_DENSE (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_K (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_Q (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_QKV (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_V (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kINVALID (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_4H_TO_H (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_GATE (C++ enumerator)
tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_H_TO_4H (C++ enumerator)
tensorrt_llm::runtime::LoraModule::mOutDim (C++ member)
tensorrt_llm::runtime::LoraModule::mOutDimFirst (C++ member)
tensorrt_llm::runtime::LoraModule::mOutTpSplitDim (C++ member)
tensorrt_llm::runtime::LoraModule::mType (C++ member)
tensorrt_llm::runtime::LoraModule::name (C++ function)
tensorrt_llm::runtime::LoraModule::operator= (C++ function)
tensorrt_llm::runtime::LoraModule::outDim (C++ function)
tensorrt_llm::runtime::LoraModule::outDimFirst (C++ function)
tensorrt_llm::runtime::LoraModule::outSize (C++ function)
tensorrt_llm::runtime::LoraModule::outTpSplitDim (C++ function)
tensorrt_llm::runtime::LoraModule::TensorPtr (C++ type)
tensorrt_llm::runtime::LoraModule::toModuleName (C++ function)
,
[1]
tensorrt_llm::runtime::LoraModule::toModuleType (C++ function)
tensorrt_llm::runtime::LoraModule::value (C++ function)
tensorrt_llm::runtime::MambaConfig (C++ struct)
tensorrt_llm::runtime::MambaConfig::dConv (C++ member)
tensorrt_llm::runtime::MambaConfig::dState (C++ member)
tensorrt_llm::runtime::MambaConfig::expand (C++ member)
tensorrt_llm::runtime::MemoryCounters (C++ class)
tensorrt_llm::runtime::MemoryCounters::allocate (C++ function)
,
[1]
tensorrt_llm::runtime::MemoryCounters::bytesToString (C++ function)
,
[1]
tensorrt_llm::runtime::MemoryCounters::deallocate (C++ function)
,
[1]
tensorrt_llm::runtime::MemoryCounters::DiffType (C++ type)
tensorrt_llm::runtime::MemoryCounters::getCpu (C++ function)
tensorrt_llm::runtime::MemoryCounters::getCpuDiff (C++ function)
tensorrt_llm::runtime::MemoryCounters::getGpu (C++ function)
tensorrt_llm::runtime::MemoryCounters::getGpuDiff (C++ function)
tensorrt_llm::runtime::MemoryCounters::getInstance (C++ function)
tensorrt_llm::runtime::MemoryCounters::getPinned (C++ function)
tensorrt_llm::runtime::MemoryCounters::getPinnedDiff (C++ function)
tensorrt_llm::runtime::MemoryCounters::getUVM (C++ function)
tensorrt_llm::runtime::MemoryCounters::getUVMDiff (C++ function)
tensorrt_llm::runtime::MemoryCounters::mCpu (C++ member)
tensorrt_llm::runtime::MemoryCounters::mCpuDiff (C++ member)
tensorrt_llm::runtime::MemoryCounters::MemoryCounters (C++ function)
tensorrt_llm::runtime::MemoryCounters::mGpu (C++ member)
tensorrt_llm::runtime::MemoryCounters::mGpuDiff (C++ member)
tensorrt_llm::runtime::MemoryCounters::mPinned (C++ member)
tensorrt_llm::runtime::MemoryCounters::mPinnedDiff (C++ member)
tensorrt_llm::runtime::MemoryCounters::mUVM (C++ member)
tensorrt_llm::runtime::MemoryCounters::mUVMDiff (C++ member)
tensorrt_llm::runtime::MemoryCounters::SizeType (C++ type)
tensorrt_llm::runtime::MemoryCounters::toString (C++ function)
tensorrt_llm::runtime::MemoryType (C++ enum)
tensorrt_llm::runtime::MemoryType::kCPU (C++ enumerator)
tensorrt_llm::runtime::MemoryType::kGPU (C++ enumerator)
tensorrt_llm::runtime::MemoryType::kPINNED (C++ enumerator)
tensorrt_llm::runtime::MemoryType::kUVM (C++ enumerator)
tensorrt_llm::runtime::MemoryTypeString (C++ struct)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kCPU> (C++ struct)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kCPU>::value (C++ member)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kGPU> (C++ struct)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kGPU>::value (C++ member)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kPINNED> (C++ struct)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kPINNED>::value (C++ member)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kUVM> (C++ struct)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kUVM>::value (C++ member)
tensorrt_llm::runtime::operator<< (C++ function)
,
[1]
,
[2]
,
[3]
,
[4]
,
[5]
tensorrt_llm::runtime::PhonyNameDueToError::name (C++ member)
,
[1]
,
[2]
,
[3]
,
[4]
,
[5]
,
[6]
,
[7]
,
[8]
tensorrt_llm::runtime::PhonyNameDueToError::size (C++ member)
,
[1]
,
[2]
,
[3]
,
[4]
,
[5]
,
[6]
,
[7]
,
[8]
tensorrt_llm::runtime::PhonyNameDueToError::type (C++ type)
,
[1]
,
[2]
,
[3]
,
[4]
,
[5]
,
[6]
,
[7]
,
[8]
tensorrt_llm::runtime::PhonyNameDueToError::value (C++ member)
,
[1]
,
[2]
,
[3]
,
[4]
,
[5]
,
[6]
,
[7]
,
[8]
,
[9]
tensorrt_llm::runtime::PointerElementType (C++ type)
tensorrt_llm::runtime::PromptTuningParams (C++ class)
tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor (C++ function)
tensorrt_llm::runtime::PromptTuningParams::PromptTuningParams (C++ function)
tensorrt_llm::runtime::PromptTuningParams::SizeType (C++ type)
tensorrt_llm::runtime::PromptTuningParams::TensorPtr (C++ type)
tensorrt_llm::runtime::SamplingConfig (C++ class)
tensorrt_llm::runtime::SamplingConfig::beamSearchDiversityRate (C++ member)
tensorrt_llm::runtime::SamplingConfig::beamWidth (C++ member)
tensorrt_llm::runtime::SamplingConfig::draftAcceptanceThreshold (C++ member)
tensorrt_llm::runtime::SamplingConfig::earlyStopping (C++ member)
tensorrt_llm::runtime::SamplingConfig::FloatType (C++ type)
tensorrt_llm::runtime::SamplingConfig::frequencyPenalty (C++ member)
tensorrt_llm::runtime::SamplingConfig::fuseValues (C++ function)
tensorrt_llm::runtime::SamplingConfig::lengthPenalty (C++ member)
tensorrt_llm::runtime::SamplingConfig::minLength (C++ member)
tensorrt_llm::runtime::SamplingConfig::normalizeLogProbs (C++ member)
tensorrt_llm::runtime::SamplingConfig::operator== (C++ function)
tensorrt_llm::runtime::SamplingConfig::OptVec (C++ type)
tensorrt_llm::runtime::SamplingConfig::presencePenalty (C++ member)
tensorrt_llm::runtime::SamplingConfig::randomSeed (C++ member)
tensorrt_llm::runtime::SamplingConfig::repetitionPenalty (C++ member)
tensorrt_llm::runtime::SamplingConfig::SamplingConfig (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::SamplingConfig::temperature (C++ member)
tensorrt_llm::runtime::SamplingConfig::topK (C++ member)
tensorrt_llm::runtime::SamplingConfig::topKMedusaHeads (C++ member)
tensorrt_llm::runtime::SamplingConfig::topP (C++ member)
tensorrt_llm::runtime::SamplingConfig::topPDecay (C++ member)
tensorrt_llm::runtime::SamplingConfig::topPMin (C++ member)
tensorrt_llm::runtime::SamplingConfig::topPResetIds (C++ member)
tensorrt_llm::runtime::SamplingConfig::Vec (C++ type)
tensorrt_llm::runtime::setPeerAccess (C++ function)
tensorrt_llm::runtime::SizeType (C++ type)
tensorrt_llm::runtime::StringPtrMap (C++ type)
tensorrt_llm::runtime::TllmLogger (C++ class)
tensorrt_llm::runtime::TllmLogger::getLevel (C++ function)
tensorrt_llm::runtime::TllmLogger::log (C++ function)
tensorrt_llm::runtime::TllmLogger::setLevel (C++ function)
tensorrt_llm::runtime::to_string (C++ function)
,
[1]
tensorrt_llm::runtime::TokenIdType (C++ type)
tensorrt_llm::runtime::TRTDataType (C++ struct)
tensorrt_llm::runtime::TRTDataType<bool> (C++ struct)
tensorrt_llm::runtime::TRTDataType<bool>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<float> (C++ struct)
tensorrt_llm::runtime::TRTDataType<float>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<half> (C++ struct)
tensorrt_llm::runtime::TRTDataType<half>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::int32_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::int32_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::int64_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::int64_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::int8_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::int8_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::uint32_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::uint32_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::uint64_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::uint64_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::uint8_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::uint8_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<T*> (C++ struct)
tensorrt_llm::runtime::TRTDataType<T*>::kUnderlyingType (C++ member)
tensorrt_llm::runtime::TRTDataType<T*>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<void*> (C++ struct)
tensorrt_llm::runtime::TRTDataType<void*>::value (C++ member)
tensorrt_llm::runtime::utils (C++ type)
tensorrt_llm::runtime::utils::loadEngine (C++ function)
tensorrt_llm::runtime::WorldConfig (C++ class)
tensorrt_llm::runtime::WorldConfig::getDevice (C++ function)
tensorrt_llm::runtime::WorldConfig::getGpusPerGroup (C++ function)
tensorrt_llm::runtime::WorldConfig::getGpusPerNode (C++ function)
tensorrt_llm::runtime::WorldConfig::getLastRank (C++ function)
tensorrt_llm::runtime::WorldConfig::getPipelineParallelGroup (C++ function)
tensorrt_llm::runtime::WorldConfig::getPipelineParallelism (C++ function)
tensorrt_llm::runtime::WorldConfig::getPipelineParallelRank (C++ function)
tensorrt_llm::runtime::WorldConfig::getRank (C++ function)
tensorrt_llm::runtime::WorldConfig::getSize (C++ function)
tensorrt_llm::runtime::WorldConfig::getTensorParallelism (C++ function)
tensorrt_llm::runtime::WorldConfig::getTensorParallelRank (C++ function)
tensorrt_llm::runtime::WorldConfig::isFirstPipelineParallelRank (C++ function)
tensorrt_llm::runtime::WorldConfig::isLastPipelineParallelRank (C++ function)
tensorrt_llm::runtime::WorldConfig::isPipelineParallel (C++ function)
tensorrt_llm::runtime::WorldConfig::isTensorParallel (C++ function)
tensorrt_llm::runtime::WorldConfig::kDefaultGpusPerNode (C++ member)
tensorrt_llm::runtime::WorldConfig::mDeviceIds (C++ member)
tensorrt_llm::runtime::WorldConfig::mGpusPerNode (C++ member)
tensorrt_llm::runtime::WorldConfig::mpi (C++ function)
tensorrt_llm::runtime::WorldConfig::mPipelineParallelism (C++ member)
tensorrt_llm::runtime::WorldConfig::mRank (C++ member)
tensorrt_llm::runtime::WorldConfig::mTensorParallelism (C++ member)
tensorrt_llm::runtime::WorldConfig::validMpiConfig (C++ function)
tensorrt_llm::runtime::WorldConfig::WorldConfig (C++ function)
to_dict() (tensorrt_llm.models.PretrainedConfig method)
to_legacy_setting() (tensorrt_llm.plugin.PluginConfig method)
to_word_list_format() (in module tensorrt_llm.runtime)
tokens_per_block (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
top_k (tensorrt_llm.runtime.SamplingConfig attribute)
top_p (tensorrt_llm.runtime.SamplingConfig attribute)
top_p_decay (tensorrt_llm.runtime.SamplingConfig attribute)
top_p_min (tensorrt_llm.runtime.SamplingConfig attribute)
top_p_reset_ids (tensorrt_llm.runtime.SamplingConfig attribute)
topk() (in module tensorrt_llm.functional)
transpose() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
trtllm_modules_to_hf_modules (tensorrt_llm.runtime.ModelConfig attribute)
TWOSHOT (tensorrt_llm.functional.AllReduceStrategy attribute)
U
unary() (in module tensorrt_llm.functional)
unsqueeze() (in module tensorrt_llm.functional)
update() (tensorrt_llm.runtime.SamplingConfig method)
update_kv_cache_draft_token_location() (tensorrt_llm.runtime.GenerationSession method)
update_output_ids_by_offset() (tensorrt_llm.runtime.GenerationSession method)
use_beam_hyps (tensorrt_llm.runtime.SamplingConfig attribute)
use_context_fmha_for_generation (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
use_custom_all_reduce (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
use_gpt_attention_plugin (tensorrt_llm.runtime.GenerationSession property)
use_lora() (tensorrt_llm.models.GPTForCausalLM method)
(tensorrt_llm.models.LLaMAForCausalLM method)
use_lora_plugin (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelRunner property)
use_mamba_conv1d_plugin (tensorrt_llm.runtime.GenerationSession property)
USE_MEMCPY (tensorrt_llm.functional.AllReduceConfig attribute)
V
view() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
vocab_size (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
vocab_size_padded (tensorrt_llm.runtime.ModelRunner property)
(tensorrt_llm.runtime.ModelRunnerCpp property)
W
weight_loader() (tensorrt_llm.layers.embedding.Embedding method)
(tensorrt_llm.layers.linear.Linear method)
(tensorrt_llm.layers.linear.ParallelLMHead method)
(tensorrt_llm.layers.linear.QKVColumnLinear method)
(tensorrt_llm.layers.linear.RowLinear method)
where() (in module tensorrt_llm.functional)
WhisperEncoder (class in tensorrt_llm.models)