trtllm-build
usage: trtllm-build [-h] [--checkpoint_dir CHECKPOINT_DIR] [--model_config MODEL_CONFIG]
[--build_config BUILD_CONFIG] [--model_cls_file MODEL_CLS_FILE]
[--model_cls_name MODEL_CLS_NAME] [--output_dir OUTPUT_DIR] [--max_batch_size MAX_BATCH_SIZE]
[--max_input_len MAX_INPUT_LEN] [--max_seq_len MAX_SEQ_LEN] [--max_beam_width MAX_BEAM_WIDTH]
[--max_num_tokens MAX_NUM_TOKENS] [--opt_num_tokens OPT_NUM_TOKENS]
[--max_encoder_input_len MAX_ENCODER_INPUT_LEN]
[--max_prompt_embedding_table_size MAX_PROMPT_EMBEDDING_TABLE_SIZE]
[--kv_cache_type KV_CACHE_TYPE] [--paged_kv_cache PAGED_KV_CACHE]
[--input_timing_cache INPUT_TIMING_CACHE] [--output_timing_cache OUTPUT_TIMING_CACHE]
[--profiling_verbosity {layer_names_only,detailed,none}] [--strip_plan] [--weight_sparsity]
[--weight_streaming] [--fast_build] [--workers WORKERS]
[--log_level {internal_error,error,warning,info,verbose,debug}] [--enable_debug_output]
[--visualize_network] [--dry_run] [--monitor_memory] [--logits_dtype {float16,float32}]
[--gather_context_logits] [--gather_generation_logits] [--gather_all_token_logits]
[--lora_dir LORA_DIR [LORA_DIR ...]] [--lora_ckpt_source {hf,nemo}]
[--lora_target_modules {attn_qkv,attn_q,attn_k,attn_v,attn_dense,mlp_h_to_4h,mlp_4h_to_h,mlp_gate,cross_attn_qkv,cross_attn_q,cross_attn_k,cross_attn_v,cross_attn_dense,moe_h_to_4h,moe_4h_to_h,moe_gate,moe_router,mlp_router} [{attn_qkv,attn_q,attn_k,attn_v,attn_dense,mlp_h_to_4h,mlp_4h_to_h,mlp_gate,cross_attn_qkv,cross_attn_q,cross_attn_k,cross_attn_v,cross_attn_dense,moe_h_to_4h,moe_4h_to_h,moe_gate,moe_router,mlp_router} ...]]
[--max_lora_rank MAX_LORA_RANK]
[--speculative_decoding_mode {draft_tokens_external,lookahead_decoding,medusa,explicit_draft_tokens,eagle}]
[--max_draft_len MAX_DRAFT_LEN] [--auto_parallel AUTO_PARALLEL]
[--gpus_per_node GPUS_PER_NODE]
[--cluster_key {A100-SXM-80GB,A100-SXM-40GB,A100-PCIe-80GB,A100-PCIe-40GB,H100-SXM,H100-PCIe,H20,V100-PCIe-16GB,V100-PCIe-32GB,V100-SXM-16GB,V100-SXM-32GB,V100S-PCIe,A40,A30,A10,A10G,L40S,L40,L20,L4,L2}]
[--bert_attention_plugin {auto,float16,float32,bfloat16,int32,disable}]
[--gpt_attention_plugin {auto,float16,float32,bfloat16,int32,disable}]
[--gemm_plugin {auto,float16,float32,bfloat16,int32,fp8,disable}]
[--gemm_swiglu_plugin {fp8,disable}]
[--fp8_rowwise_gemm_plugin {auto,float16,float32,bfloat16,int32,disable}]
[--nccl_plugin {auto,float16,float32,bfloat16,int32,disable}]
[--lora_plugin {auto,float16,float32,bfloat16,int32,disable}]
[--moe_plugin {auto,float16,float32,bfloat16,int32,disable}]
[--mamba_conv1d_plugin {auto,float16,float32,bfloat16,int32,disable}]
[--low_latency_gemm_plugin {fp8,disable}] [--context_fmha {enable,disable}]
[--bert_context_fmha_fp32_acc {enable,disable}] [--remove_input_padding {enable,disable}]
[--reduce_fusion {enable,disable}] [--enable_xqa {enable,disable}]
[--tokens_per_block TOKENS_PER_BLOCK] [--use_paged_context_fmha {enable,disable}]
[--use_fp8_context_fmha {enable,disable}] [--multiple_profiles {enable,disable}]
[--paged_state {enable,disable}] [--streamingllm {enable,disable}]
[--use_fused_mlp {enable,disable}] [--pp_reduce_scatter {enable,disable}]
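For example, a minimal build that reads a converted checkpoint and writes engines to the default output directory might look like the following; all paths are placeholders:

```bash
# Build TensorRT engines from a TensorRT-LLM checkpoint directory.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engine_outputs
```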
Named Arguments
- --checkpoint_dir
The directory that contains the TensorRT-LLM checkpoint.
- --model_config
The file path of the TensorRT-LLM checkpoint config.
- --build_config
The file path of the TensorRT-LLM build config.
- --model_cls_file
The file path that defines a customized TensorRT-LLM model.
- --model_cls_name
The class name of the customized TensorRT-LLM model.
- --output_dir
The directory path to save the serialized engine files and engine config file.
Default:
'engine_outputs'
- --max_batch_size
Maximum number of requests that the engine can schedule.
Default:
2048
- --max_input_len
Maximum input length of one request.
Default:
1024
- --max_seq_len, --max_decoder_seq_len
Maximum total length of one request, including prompt and outputs. If unspecified, the value is deduced from the model config.
- --max_beam_width
Maximum number of beams for beam search decoding.
Default:
1
- --max_num_tokens
Maximum number of batched input tokens after padding is removed in each batch. Input padding is removed by default; you may explicitly disable it by specifying --remove_input_padding disable.
Default:
8192
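As an illustration, the sequence and batching limits above combine as follows; the concrete values are arbitrary and should be sized to your workload:

```bash
# Size the engine for up to 8 concurrent requests, 2048-token prompts,
# and 4096 total tokens (prompt + generated) per request.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engine_outputs \
             --max_batch_size 8 \
             --max_input_len 2048 \
             --max_seq_len 4096 \
             --max_num_tokens 8192
```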
- --opt_num_tokens
Optimal number of batched input tokens after padding is removed in each batch. It defaults to max_batch_size * max_beam_width; set this value as close as possible to the actual number of tokens in your workload. Note that this argument might be removed in the future.
- --max_encoder_input_len
Maximum encoder input length for enc-dec models. Set max_input_len to 1 to start generation from a decoder_start_token_id of length 1.
Default:
1024
- --max_prompt_embedding_table_size, --max_multimodal_len
Maximum prompt embedding table size for prompt tuning, or maximum multimodal input size for multimodal models. Setting a value > 0 enables prompt tuning or multimodal input.
Default:
0
- --kv_cache_type
Set the KV cache type (continuous, paged, or disabled). When disabled, the KV cache is turned off and only the context phase is allowed.
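For instance, a paged KV cache can be requested explicitly; the value is one of the documented choices:

```bash
# Build with a paged KV cache.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engine_outputs \
             --kv_cache_type paged
```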
- --paged_kv_cache
Deprecated. Enabling this option is equivalent to --kv_cache_type paged for transformer-based models.
- --input_timing_cache
The file path to read the timing cache. This option is ignored if the file does not exist.
- --output_timing_cache
The file path to write the timing cache.
Default:
'model.cache'
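A common pattern is to let one build write the timing cache and have later builds read it to speed up engine compilation; a sketch:

```bash
# The first build writes timing data to model.cache (the default name).
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engines_a \
             --output_timing_cache model.cache

# Later builds reuse it; the option is ignored if the file is missing.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engines_b \
             --input_timing_cache model.cache
```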
- --profiling_verbosity
Possible choices: layer_names_only, detailed, none
The profiling verbosity of the generated TensorRT engine. Setting it to detailed allows inspecting tactic choices and kernel parameters.
Default:
'layer_names_only'
- --strip_plan
Enable stripping weights from the final TensorRT engine under the assumption that the refit weights are identical to those provided at build time.
Default:
False
- --weight_sparsity
Enable weight sparsity.
Default:
False
- --weight_streaming
Enable offloading weights to CPU and streaming loading at runtime.
Default:
False
- --fast_build
Enable features for faster engine building. This may cause some performance degradation and is currently incompatible with int8/int4 quantization.
Default:
False
- --workers
The number of workers for building in parallel.
Default:
1
- --log_level
Possible choices: internal_error, error, warning, info, verbose, debug
The logging level.
Default:
'info'
- --enable_debug_output
Enable debug output.
Default:
False
- --visualize_network
Export the TensorRT network to ONNX before the engine build, for debugging.
Default:
False
- --dry_run
Run through the build process without the actual engine build, for debugging.
Default:
False
- --monitor_memory
Enable the memory monitor during the engine build.
Default:
False
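For checking a build configuration without paying for a full engine build, the debug-oriented flags above can be combined, for example:

```bash
# Validate the build configuration and network without building the engine.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --dry_run \
             --visualize_network \
             --log_level verbose
```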
Logits arguments
- --logits_dtype
Possible choices: float16, float32
The data type of logits.
- --gather_context_logits
Enable gathering context logits.
Default:
False
- --gather_generation_logits
Enable gathering generation logits.
Default:
False
- --gather_all_token_logits
Enable both gather_context_logits and gather_generation_logits.
Default:
False
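For example, to build an engine that returns logits for every token (which typically increases memory use), enable both gather options at once:

```bash
# Gather both context and generation logits, stored as float32.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engine_outputs \
             --gather_all_token_logits \
             --logits_dtype float32
```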
LoRA arguments
- --lora_dir
The directory of LoRA weights. If multiple directories are provided, the first one is used for configuration.
- --lora_ckpt_source
Possible choices: hf, nemo
The source type of LoRA checkpoint.
Default:
'hf'
- --lora_target_modules
Possible choices: attn_qkv, attn_q, attn_k, attn_v, attn_dense, mlp_h_to_4h, mlp_4h_to_h, mlp_gate, cross_attn_qkv, cross_attn_q, cross_attn_k, cross_attn_v, cross_attn_dense, moe_h_to_4h, moe_4h_to_h, moe_gate, moe_router, mlp_router
The target module names to which LoRA is applied. Only effective when lora_plugin is enabled.
- --max_lora_rank
Maximum LoRA rank for the different LoRA modules. It is used to compute the workspace size of the LoRA plugin.
Default:
64
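A LoRA-enabled build wires these options together; note that --lora_plugin (from the plugin section below) must be enabled for --lora_target_modules to take effect. The module names and paths here are illustrative:

```bash
# Build with LoRA support for a Hugging Face LoRA checkpoint.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engine_outputs \
             --lora_plugin auto \
             --lora_ckpt_source hf \
             --lora_dir ./lora_weights \
             --lora_target_modules attn_q attn_k attn_v \
             --max_lora_rank 64
```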
Speculative decoding arguments
- --speculative_decoding_mode
Possible choices: draft_tokens_external, lookahead_decoding, medusa, explicit_draft_tokens, eagle
Mode of speculative decoding.
- --max_draft_len
Maximum length of draft tokens for the speculative decoding target model.
Default:
0
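As a sketch, a target-model build that reserves room for externally supplied draft tokens might look like this; the draft length is an arbitrary example value:

```bash
# Target-model build for draft-token speculative decoding.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engine_outputs \
             --speculative_decoding_mode draft_tokens_external \
             --max_draft_len 10
```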
Auto parallel arguments
- --auto_parallel
MPI world size for auto parallel.
Default:
1
- --gpus_per_node
Number of GPUs per node in a multi-node setup. This is a cluster spec and can be greater or smaller than the world size. This option is only used when auto parallel is enabled with --auto_parallel.
Default:
8
- --cluster_key
Possible choices: A100-SXM-80GB, A100-SXM-40GB, A100-PCIe-80GB, A100-PCIe-40GB, H100-SXM, H100-PCIe, H20, V100-PCIe-16GB, V100-PCIe-32GB, V100-SXM-16GB, V100-SXM-32GB, V100S-PCIe, A40, A30, A10, A10G, L40S, L40, L20, L4, L2
Unique name of the target GPU type. Inferred from the current GPU type if not specified. This option is only used when auto parallel is enabled with --auto_parallel.
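For instance, to let the builder search for a parallel partitioning across two ranks on H100-SXM nodes (the cluster key is one of the documented choices):

```bash
# Auto-parallel build across an MPI world size of 2.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engine_outputs \
             --auto_parallel 2 \
             --gpus_per_node 8 \
             --cluster_key H100-SXM
```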
Plugin config arguments
- --bert_attention_plugin
Possible choices: auto, float16, float32, bfloat16, int32, disable
Whether to enable/disable bert_attention_plugin and the dtype.
Default:
'auto'
- --gpt_attention_plugin
Possible choices: auto, float16, float32, bfloat16, int32, disable
Whether to enable/disable gpt_attention_plugin and the dtype.
Default:
'auto'
- --gemm_plugin
Possible choices: auto, float16, float32, bfloat16, int32, fp8, disable
Whether to enable/disable gemm_plugin and the dtype.
Default:
'disable'
- --gemm_swiglu_plugin
Possible choices: fp8, disable
Whether to enable/disable gemm_swiglu_plugin and the dtype.
Default:
'disable'
- --fp8_rowwise_gemm_plugin
Possible choices: auto, float16, float32, bfloat16, int32, disable
Whether to enable/disable fp8_rowwise_gemm_plugin and the dtype.
Default:
'disable'
- --nccl_plugin
Possible choices: auto, float16, float32, bfloat16, int32, disable
Whether to enable/disable nccl_plugin and the dtype.
Default:
'auto'
- --lora_plugin
Possible choices: auto, float16, float32, bfloat16, int32, disable
Whether to enable/disable lora_plugin and the dtype.
Default:
'disable'
- --moe_plugin
Possible choices: auto, float16, float32, bfloat16, int32, disable
Whether to enable/disable moe_plugin and the dtype.
Default:
'auto'
- --mamba_conv1d_plugin
Possible choices: auto, float16, float32, bfloat16, int32, disable
Whether to enable/disable mamba_conv1d_plugin and the dtype.
Default:
'auto'
- --low_latency_gemm_plugin
Possible choices: fp8, disable
Whether to enable/disable low_latency_gemm_plugin and the dtype.
Default:
'disable'
- --context_fmha
Possible choices: enable, disable
Whether to enable/disable context_fmha.
Default:
'enable'
- --bert_context_fmha_fp32_acc
Possible choices: enable, disable
Whether to enable/disable bert_context_fmha_fp32_acc.
Default:
'disable'
- --remove_input_padding
Possible choices: enable, disable
Whether to enable/disable remove_input_padding.
Default:
'enable'
- --reduce_fusion
Possible choices: enable, disable
Whether to enable/disable reduce_fusion.
Default:
'disable'
- --enable_xqa
Possible choices: enable, disable
Whether to enable/disable enable_xqa.
Default:
'enable'
Number of tokens per block for the paged KV cache.
Default:
64
- --use_paged_context_fmha
Possible choices: enable, disable
Whether to enable/disable use_paged_context_fmha.
Default:
'disable'
- --use_fp8_context_fmha
Possible choices: enable, disable
Whether to enable/disable use_fp8_context_fmha.
Default:
'disable'
- --multiple_profiles
Possible choices: enable, disable
Whether to enable/disable multiple_profiles.
Default:
'disable'
- --paged_state
Possible choices: enable, disable
Whether to enable/disable paged_state.
Default:
'enable'
- --streamingllm
Possible choices: enable, disable
Whether to enable/disable streamingllm.
Default:
'disable'
- --use_fused_mlp
Possible choices: enable, disable
Whether to enable/disable use_fused_mlp.
Default:
'enable'
- --pp_reduce_scatter
Possible choices: enable, disable
Whether to enable/disable pp_reduce_scatter.
Default:
'disable'
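Putting several plugin options together, a build that enables paged context FMHA (which pairs with the paged KV cache) and multiple optimization profiles might look like the following sketch; whether these options help depends on the model and workload:

```bash
# Example plugin configuration for paged context attention.
trtllm-build --checkpoint_dir ./tllm_checkpoint \
             --output_dir ./engine_outputs \
             --gemm_plugin auto \
             --context_fmha enable \
             --use_paged_context_fmha enable \
             --tokens_per_block 64 \
             --multiple_profiles enable
```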