backend
|
Optional[str]
|
categorical
|
allowlist |
pytorch, tensorrt, _autodeploy
|
batching_type
|
Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
|
categorical
|
|
STATIC, INFLIGHT
|
build_config.dry_run
|
<class 'bool'>
|
value
|
|
|
build_config.enable_debug_output
|
<class 'bool'>
|
value
|
|
|
build_config.force_num_profiles
|
Optional[int]
|
value
|
|
|
build_config.gather_context_logits
|
<class 'bool'>
|
value
|
|
|
build_config.gather_generation_logits
|
<class 'bool'>
|
value
|
|
|
build_config.kv_cache_type
|
Optional[tensorrt_llm.llmapi.kv_cache_type.KVCacheType]
|
categorical
|
|
continuous, paged, disabled
|
build_config.lora_config.lora_ckpt_source
|
Literal['hf', 'nemo']
|
categorical
|
|
hf, nemo
|
build_config.lora_config.max_cpu_loras
|
Optional[int]
|
value
|
|
|
build_config.lora_config.max_lora_rank
|
<class 'int'>
|
value
|
|
|
build_config.lora_config.max_loras
|
Optional[int]
|
value
|
|
|
build_config.lora_config.swap_gate_up_proj_lora_b_weight
|
<class 'bool'>
|
value
|
|
|
build_config.max_batch_size
|
<class 'int'>
|
value
|
|
|
build_config.max_beam_width
|
<class 'int'>
|
value
|
|
|
build_config.max_draft_len
|
<class 'int'>
|
value
|
|
|
build_config.max_encoder_input_len
|
<class 'int'>
|
value
|
|
|
build_config.max_input_len
|
<class 'int'>
|
value
|
|
|
build_config.max_num_tokens
|
<class 'int'>
|
value
|
|
|
build_config.max_prompt_embedding_table_size
|
<class 'int'>
|
value
|
|
|
build_config.max_seq_len
|
Optional[int]
|
value
|
|
|
build_config.monitor_memory
|
<class 'bool'>
|
value
|
|
|
build_config.opt_batch_size
|
<class 'int'>
|
value
|
|
|
build_config.opt_num_tokens
|
Optional[int]
|
value
|
|
|
build_config.plugin_config.bert_attention_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.bert_context_fmha_fp32_acc
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.context_fmha
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.dora_plugin
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.fp8_rowwise_gemm_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.fuse_fp4_quant
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.gemm_allreduce_plugin
|
Optional[Literal['float16', 'bfloat16', None]]
|
categorical
|
|
float16, bfloat16, None
|
build_config.plugin_config.gemm_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', 'fp8', 'nvfp4', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, fp8, nvfp4, None
|
build_config.plugin_config.gemm_swiglu_plugin
|
Optional[Literal['fp8', None]]
|
categorical
|
|
fp8, None
|
build_config.plugin_config.gpt_attention_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.identity_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.layernorm_quantization_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.lora_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.low_latency_gemm_plugin
|
Optional[Literal['fp8', None]]
|
categorical
|
|
fp8, None
|
build_config.plugin_config.low_latency_gemm_swiglu_plugin
|
Optional[Literal['fp8', None]]
|
categorical
|
|
fp8, None
|
build_config.plugin_config.mamba_conv1d_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.manage_weights
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.moe_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.multiple_profiles
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.nccl_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.norm_quant_fusion
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.paged_kv_cache
|
Optional[bool]
|
value
|
|
|
build_config.plugin_config.paged_state
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.pp_reduce_scatter
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.qserve_gemm_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.quantize_per_token_plugin
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.quantize_tensor_plugin
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.reduce_fusion
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.remove_input_padding
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.rmsnorm_quantization_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.smooth_quant_gemm_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.smooth_quant_plugins
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.streamingllm
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.tokens_per_block
|
<class 'int'>
|
value
|
|
|
build_config.plugin_config.use_fp8_context_fmha
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.use_fused_mlp
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.use_paged_context_fmha
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.user_buffer
|
<class 'bool'>
|
value
|
|
|
build_config.plugin_config.weight_only_groupwise_quant_matmul_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.plugin_config.weight_only_quant_matmul_plugin
|
Optional[Literal['auto', 'float16', 'float32', 'bfloat16', 'int32', None]]
|
categorical
|
|
auto, float16, float32, bfloat16, int32, None
|
build_config.speculative_decoding_mode
|
<flag 'SpeculativeDecodingMode'>
|
categorical
|
|
NONE, DRAFT_TOKENS_EXTERNAL, MEDUSA, LOOKAHEAD_DECODING, EXPLICIT_DRAFT_TOKENS, EAGLE, NGRAM, USER_PROVIDED, SAVE_HIDDEN_STATES, AUTO
|
build_config.strongly_typed
|
<class 'bool'>
|
value
|
|
|
build_config.use_mrope
|
<class 'bool'>
|
value
|
|
|
build_config.use_refit
|
<class 'bool'>
|
value
|
|
|
build_config.use_strip_plan
|
<class 'bool'>
|
value
|
|
|
build_config.weight_sparsity
|
<class 'bool'>
|
value
|
|
|
build_config.weight_streaming
|
<class 'bool'>
|
value
|
|
|
cache_transceiver_config.backend
|
Optional[Literal['DEFAULT', 'UCX', 'NIXL', 'MOONCAKE', 'MPI']]
|
categorical
|
|
DEFAULT, UCX, NIXL, MOONCAKE, MPI
|
cache_transceiver_config.kv_transfer_sender_future_timeout_ms
|
Optional[Annotated[int, Gt(gt=0)]]
|
value
|
|
|
cache_transceiver_config.kv_transfer_timeout_ms
|
Optional[Annotated[int, Gt(gt=0)]]
|
value
|
|
|
cache_transceiver_config.max_tokens_in_buffer
|
Optional[int]
|
value
|
|
|
cache_transceiver_config.transceiver_runtime
|
Optional[Literal['CPP', 'PYTHON']]
|
categorical
|
|
CPP, PYTHON
|
calib_config.calib_batch_size
|
<class 'int'>
|
value
|
|
|
calib_config.calib_batches
|
<class 'int'>
|
value
|
|
|
calib_config.calib_max_seq_length
|
<class 'int'>
|
value
|
|
|
calib_config.device
|
Literal['cuda', 'cpu']
|
categorical
|
|
cuda, cpu
|
calib_config.random_seed
|
<class 'int'>
|
value
|
|
|
calib_config.tokenizer_max_seq_length
|
<class 'int'>
|
value
|
|
|
context_parallel_size
|
<class 'int'>
|
value
|
|
|
cp_config.block_size
|
Optional[int]
|
value
|
|
|
cp_config.cp_anchor_size
|
Optional[int]
|
value
|
|
|
cp_config.cp_type
|
<enum 'CpType'>
|
categorical
|
|
ULYSSES, STAR, RING, HELIX
|
cp_config.fifo_version
|
Optional[int]
|
value
|
|
|
cp_config.tokens_per_block
|
Optional[int]
|
value
|
|
|
cp_config.use_nccl_for_alltoall
|
Optional[bool]
|
value
|
|
|
dtype
|
<class 'str'>
|
categorical
|
allowlist |
auto, float16, bfloat16, float32
|
embedding_parallel_mode
|
Literal['NONE', 'SHARDING_ALONG_VOCAB', 'SHARDING_ALONG_HIDDEN']
|
categorical
|
|
NONE, SHARDING_ALONG_VOCAB, SHARDING_ALONG_HIDDEN
|
enable_attention_dp
|
<class 'bool'>
|
value
|
|
|
enable_build_cache.max_cache_storage_gb
|
<class 'float'>
|
value
|
|
|
enable_build_cache.max_records
|
<class 'int'>
|
value
|
|
|
enable_chunked_prefill
|
<class 'bool'>
|
value
|
|
|
enable_energy_metrics
|
<class 'bool'>
|
value
|
|
|
enable_lm_head_tp_in_adp
|
<class 'bool'>
|
value
|
|
|
enable_lora
|
<class 'bool'>
|
value
|
|
|
enable_prompt_adapter
|
<class 'bool'>
|
value
|
|
|
enable_tqdm
|
<class 'bool'>
|
value
|
|
|
extended_runtime_perf_knob_config.cuda_graph_cache_size
|
<class 'int'>
|
value
|
|
|
extended_runtime_perf_knob_config.cuda_graph_mode
|
<class 'bool'>
|
value
|
|
|
extended_runtime_perf_knob_config.enable_context_fmha_fp32_acc
|
<class 'bool'>
|
value
|
|
|
extended_runtime_perf_knob_config.multi_block_mode
|
<class 'bool'>
|
value
|
|
|
fail_fast_on_attention_window_too_large
|
<class 'bool'>
|
value
|
|
|
fast_build
|
<class 'bool'>
|
value
|
|
|
gather_generation_logits
|
<class 'bool'>
|
value
|
|
|
gpus_per_node
|
Optional[int]
|
value
|
|
|
guided_decoding_backend
|
Optional[Literal['xgrammar', 'llguidance']]
|
categorical
|
|
xgrammar, llguidance
|
iter_stats_max_iterations
|
Optional[int]
|
value
|
|
|
kv_cache_config.attention_dp_events_gather_period_ms
|
<class 'int'>
|
value
|
|
|
kv_cache_config.copy_on_partial_reuse
|
<class 'bool'>
|
value
|
|
|
kv_cache_config.cross_kv_cache_fraction
|
Optional[float]
|
value
|
|
|
kv_cache_config.dtype
|
<class 'str'>
|
categorical
|
allowlist |
auto, float16, bfloat16, float32, fp8, nvfp4
|
kv_cache_config.enable_block_reuse
|
<class 'bool'>
|
value
|
|
|
kv_cache_config.enable_partial_reuse
|
<class 'bool'>
|
value
|
|
|
kv_cache_config.event_buffer_max_size
|
<class 'int'>
|
value
|
|
|
kv_cache_config.free_gpu_memory_fraction
|
Optional[float]
|
value
|
|
|
kv_cache_config.host_cache_size
|
Optional[int]
|
value
|
|
|
kv_cache_config.iteration_stats_interval
|
<class 'int'>
|
value
|
|
|
kv_cache_config.mamba_ssm_cache_dtype
|
Literal['auto', 'float16', 'bfloat16', 'float32']
|
categorical
|
|
auto, float16, bfloat16, float32
|
kv_cache_config.mamba_ssm_philox_rounds
|
<class 'int'>
|
value
|
|
|
kv_cache_config.mamba_ssm_stochastic_rounding
|
<class 'bool'>
|
value
|
|
|
kv_cache_config.mamba_state_cache_interval
|
<class 'int'>
|
value
|
|
|
kv_cache_config.max_attention_window
|
Optional[List[int]]
|
value
|
|
|
kv_cache_config.max_gpu_total_bytes
|
<class 'int'>
|
value
|
|
|
kv_cache_config.max_tokens
|
Optional[int]
|
value
|
|
|
kv_cache_config.max_util_for_resume
|
<class 'float'>
|
value
|
|
|
kv_cache_config.secondary_offload_min_priority
|
Optional[int]
|
value
|
|
|
kv_cache_config.sink_token_length
|
Optional[int]
|
value
|
|
|
kv_cache_config.tokens_per_block
|
<class 'int'>
|
value
|
|
|
kv_cache_config.use_kv_cache_manager_v2
|
<class 'bool'>
|
value
|
|
|
kv_cache_config.use_uvm
|
<class 'bool'>
|
value
|
|
|
load_format
|
Literal['auto', 'dummy']
|
categorical
|
|
auto, dummy
|
lora_config.lora_ckpt_source
|
Literal['hf', 'nemo']
|
categorical
|
|
hf, nemo
|
lora_config.max_cpu_loras
|
Optional[int]
|
value
|
|
|
lora_config.max_lora_rank
|
<class 'int'>
|
value
|
|
|
lora_config.max_loras
|
Optional[int]
|
value
|
|
|
lora_config.swap_gate_up_proj_lora_b_weight
|
<class 'bool'>
|
value
|
|
|
max_batch_size
|
Optional[int]
|
value
|
|
|
max_beam_width
|
Optional[int]
|
value
|
|
|
max_input_len
|
Optional[int]
|
value
|
|
|
max_num_tokens
|
Optional[int]
|
value
|
|
|
max_prompt_adapter_token
|
<class 'int'>
|
value
|
|
|
max_seq_len
|
Optional[int]
|
value
|
|
|
moe_cluster_parallel_size
|
Optional[int]
|
value
|
|
|
moe_expert_parallel_size
|
Optional[int]
|
value
|
|
|
moe_tensor_parallel_size
|
Optional[int]
|
value
|
|
|
normalize_log_probs
|
<class 'bool'>
|
value
|
|
|
num_postprocess_workers
|
<class 'int'>
|
value
|
|
|
orchestrator_type
|
Optional[Literal['rpc', 'ray']]
|
categorical
|
|
rpc, ray
|
peft_cache_config.device_cache_percent
|
<class 'float'>
|
value
|
|
|
peft_cache_config.host_cache_size
|
<class 'int'>
|
value
|
|
|
peft_cache_config.max_adapter_size
|
<class 'int'>
|
value
|
|
|
peft_cache_config.max_pages_per_block_device
|
<class 'int'>
|
value
|
|
|
peft_cache_config.max_pages_per_block_host
|
<class 'int'>
|
value
|
|
|
peft_cache_config.num_copy_streams
|
<class 'int'>
|
value
|
|
|
peft_cache_config.num_device_module_layer
|
<class 'int'>
|
value
|
|
|
peft_cache_config.num_ensure_workers
|
<class 'int'>
|
value
|
|
|
peft_cache_config.num_host_module_layer
|
<class 'int'>
|
value
|
|
|
peft_cache_config.num_put_workers
|
<class 'int'>
|
value
|
|
|
peft_cache_config.optimal_adapter_size
|
<class 'int'>
|
value
|
|
|
perf_metrics_max_requests
|
<class 'int'>
|
value
|
|
|
pipeline_parallel_size
|
<class 'int'>
|
value
|
|
|
pp_partition
|
Optional[List[int]]
|
value
|
|
|
prometheus_metrics_config.e2e_request_latency_buckets
|
Optional[List[float]]
|
value
|
|
|
prometheus_metrics_config.request_decode_time_buckets
|
Optional[List[float]]
|
value
|
|
|
prometheus_metrics_config.request_inference_time_buckets
|
Optional[List[float]]
|
value
|
|
|
prometheus_metrics_config.request_prefill_time_buckets
|
Optional[List[float]]
|
value
|
|
|
prometheus_metrics_config.request_queue_time_buckets
|
Optional[List[float]]
|
value
|
|
|
prometheus_metrics_config.time_per_output_token_buckets
|
Optional[List[float]]
|
value
|
|
|
prometheus_metrics_config.time_to_first_token_buckets
|
Optional[List[float]]
|
value
|
|
|
quant_config.clamp_val
|
Optional[List[float]]
|
value
|
|
|
quant_config.group_size
|
Optional[int]
|
value
|
|
|
quant_config.has_zero_point
|
<class 'bool'>
|
value
|
|
|
quant_config.kv_cache_quant_algo
|
Optional[tensorrt_llm.quantization.mode.QuantAlgo]
|
categorical
|
|
W8A16, W4A16, W4A16_AWQ, W4A8_AWQ, W8A16_GPTQ, W4A16_GPTQ, W8A8_SQ_PER_CHANNEL, W8A8_SQ_PER_TENSOR_PLUGIN, W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN, W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN, W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN, W4A8_QSERVE_PER_GROUP, W4A8_QSERVE_PER_CHANNEL, FP8, FP8_PER_CHANNEL_PER_TOKEN, FP8_BLOCK_SCALES, INT8, MIXED_PRECISION, NVFP4, W4A8_NVFP4_FP8, W4A8_MXFP4_FP8, W4A8_MXFP4_MXFP8, W4A16_MXFP4, NVFP4_AWQ, NVFP4_ARC, NO_QUANT
|
quant_config.mamba_ssm_philox_rounds
|
<class 'int'>
|
value
|
|
|
quant_config.mamba_ssm_stochastic_rounding
|
<class 'bool'>
|
value
|
|
|
quant_config.pre_quant_scale
|
<class 'bool'>
|
value
|
|
|
quant_config.quant_algo
|
Optional[tensorrt_llm.quantization.mode.QuantAlgo]
|
categorical
|
|
W8A16, W4A16, W4A16_AWQ, W4A8_AWQ, W8A16_GPTQ, W4A16_GPTQ, W8A8_SQ_PER_CHANNEL, W8A8_SQ_PER_TENSOR_PLUGIN, W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN, W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN, W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN, W4A8_QSERVE_PER_GROUP, W4A8_QSERVE_PER_CHANNEL, FP8, FP8_PER_CHANNEL_PER_TOKEN, FP8_BLOCK_SCALES, INT8, MIXED_PRECISION, NVFP4, W4A8_NVFP4_FP8, W4A8_MXFP4_FP8, W4A8_MXFP4_MXFP8, W4A16_MXFP4, NVFP4_AWQ, NVFP4_ARC, NO_QUANT
|
quant_config.smoothquant_val
|
<class 'float'>
|
value
|
|
|
quant_config.use_meta_recipe
|
<class 'bool'>
|
value
|
|
|
reasoning_parser
|
Optional[str]
|
categorical
|
allowlist |
auto, deepseek-r1, laguna, qwen3, qwen3_5, minimax_m2, minimax_m2_append_think, nano-v3, gemma4, kimi_k2, kimi_k25
|
request_stats_max_iterations
|
Optional[int]
|
value
|
|
|
return_perf_metrics
|
<class 'bool'>
|
value
|
|
|
scheduler_config.capacity_scheduler_policy
|
<enum 'CapacitySchedulerPolicy'>
|
categorical
|
|
MAX_UTILIZATION, GUARANTEED_NO_EVICT, STATIC_BATCH
|
scheduler_config.context_chunking_policy
|
Optional[tensorrt_llm.llmapi.llm_args.ContextChunkingPolicy]
|
categorical
|
|
FIRST_COME_FIRST_SERVED, EQUAL_PROGRESS, FORCE_CHUNK
|
scheduler_config.dynamic_batch_config.dynamic_batch_moving_average_window
|
<class 'int'>
|
value
|
|
|
scheduler_config.dynamic_batch_config.enable_batch_size_tuning
|
<class 'bool'>
|
value
|
|
|
scheduler_config.dynamic_batch_config.enable_max_num_tokens_tuning
|
<class 'bool'>
|
value
|
|
|
scheduler_config.use_python_scheduler
|
<class 'bool'>
|
value
|
|
|
scheduler_config.waiting_queue_policy
|
<enum 'WaitingQueuePolicy'>
|
categorical
|
|
fcfs, priority
|
skip_tokenizer_init
|
<class 'bool'>
|
value
|
|
|
sparse_attention_config.algorithm
|
Literal['dsa', 'rocket', 'skip_softmax']
|
categorical
|
|
dsa, rocket, skip_softmax
|
sparse_attention_config.enable_heuristic_topk
|
<class 'bool'>
|
value
|
|
|
sparse_attention_config.index_head_dim
|
Optional[int]
|
value
|
|
|
sparse_attention_config.index_n_heads
|
Optional[int]
|
value
|
|
|
sparse_attention_config.index_topk
|
Optional[int]
|
value
|
|
|
sparse_attention_config.indexer_k_dtype
|
Literal['fp8', 'fp4']
|
categorical
|
|
fp8, fp4
|
sparse_attention_config.indexer_max_chunk_size
|
Optional[int]
|
value
|
|
|
sparse_attention_config.indexer_rope_interleave
|
<class 'bool'>
|
value
|
|
|
sparse_attention_config.kernel_size
|
Optional[int]
|
value
|
|
|
sparse_attention_config.kt_cache_dtype
|
Optional[str]
|
categorical
|
allowlist |
bfloat16, float8_e5m2
|
sparse_attention_config.page_size
|
Optional[int]
|
value
|
|
|
sparse_attention_config.prompt_budget
|
Optional[int]
|
value
|
|
|
sparse_attention_config.q_split_threshold
|
<class 'int'>
|
value
|
|
|
sparse_attention_config.seq_len_threshold
|
Optional[int]
|
value
|
|
|
sparse_attention_config.skip_indexer_for_short_seqs
|
<class 'bool'>
|
value
|
|
|
sparse_attention_config.topk
|
Optional[int]
|
value
|
|
|
sparse_attention_config.topr
|
Union[int, float, NoneType]
|
value
|
|
|
sparse_attention_config.use_cute_dsl_paged_mqa_logits
|
<class 'bool'>
|
value
|
|
|
sparse_attention_config.use_cute_dsl_topk
|
<class 'bool'>
|
value
|
|
|
sparse_attention_config.window_size
|
Optional[int]
|
value
|
|
|
speculative_config.acceptance_length_threshold
|
Optional[Annotated[float, Ge(ge=0)]]
|
value
|
|
|
speculative_config.acceptance_window
|
Optional[Annotated[int, Ge(ge=0)]]
|
value
|
|
|
speculative_config.allow_advanced_sampling
|
<class 'bool'>
|
value
|
|
|
speculative_config.begin_thinking_phase_token
|
<class 'int'>
|
value
|
|
|
speculative_config.decoding_type
|
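Literal['AUTO', 'DFlash', 'Draft_Target', 'Eagle3', 'Eagle', 'Lookahead', 'MTP', 'Medusa', 'NGram', 'PARD', 'SA', 'SaveState', 'User_Provided']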
|
categorical
|
|
AUTO, DFlash, Draft_Target, Eagle3, Eagle, Lookahead, MTP, Medusa, NGram, PARD, SA, SaveState, User_Provided
|
speculative_config.dynamic_tree_max_topK
|
Optional[int]
|
value
|
|
|
speculative_config.eagle3_layers_to_capture
|
Optional[Set[int]]
|
value
|
|
|
speculative_config.eagle3_model_arch
|
Literal['llama3', 'mistral_large3']
|
categorical
|
|
llama3, mistral_large3
|
speculative_config.eagle3_one_model
|
Optional[bool]
|
value
|
|
|
speculative_config.eagle_choices
|
Optional[List[List[int]]]
|
value
|
|
|
speculative_config.enable_global_pool
|
<class 'bool'>
|
value
|
|
|
speculative_config.end_thinking_phase_token
|
<class 'int'>
|
value
|
|
|
speculative_config.global_pool_size
|
Optional[Annotated[int, Gt(gt=0)]]
|
value
|
|
|
speculative_config.greedy_sampling
|
Optional[bool]
|
value
|
|
|
speculative_config.is_keep_all
|
<class 'bool'>
|
value
|
|
|
speculative_config.is_public_pool
|
<class 'bool'>
|
value
|
|
|
speculative_config.is_use_oldest
|
<class 'bool'>
|
value
|
|
|
speculative_config.mask_token_id
|
Optional[int]
|
value
|
|
|
speculative_config.max_concurrency
|
Optional[Annotated[int, Gt(gt=0)]]
|
value
|
|
|
speculative_config.max_draft_len
|
Optional[Annotated[int, Ge(ge=0)]]
|
value
|
|
|
speculative_config.max_matching_ngram_size
|
<class 'int'>
|
value
|
|
|
speculative_config.max_ngram_size
|
<class 'int'>
|
value
|
|
|
speculative_config.max_non_leaves_per_layer
|
Optional[int]
|
value
|
|
|
speculative_config.max_total_draft_tokens
|
Optional[int]
|
value
|
|
|
speculative_config.max_verification_set_size
|
<class 'int'>
|
value
|
|
|
speculative_config.max_window_size
|
<class 'int'>
|
value
|
|
|
speculative_config.medusa_choices
|
Optional[List[List[int]]]
|
value
|
|
|
speculative_config.mtp_eagle_one_model
|
<class 'bool'>
|
value
|
|
|
speculative_config.num_eagle_layers
|
Optional[int]
|
value
|
|
|
speculative_config.num_medusa_heads
|
Optional[int]
|
value
|
|
|
speculative_config.num_nextn_predict_layers
|
Optional[int]
|
value
|
|
|
speculative_config.posterior_threshold
|
Optional[float]
|
value
|
|
|
speculative_config.relaxed_delta
|
<class 'float'>
|
value
|
|
|
speculative_config.relaxed_topk
|
<class 'int'>
|
value
|
|
|
speculative_config.sa_config.enable_global_pool
|
<class 'bool'>
|
value
|
|
|
speculative_config.sa_config.threshold
|
<class 'int'>
|
value
|
|
|
speculative_config.target_layer_ids
|
Optional[List[int]]
|
value
|
|
|
speculative_config.use_dynamic_tree
|
Optional[bool]
|
value
|
|
|
speculative_config.use_mtp_vanilla
|
<class 'bool'>
|
value
|
|
|
speculative_config.use_rejection_sampling
|
<class 'bool'>
|
value
|
|
|
speculative_config.use_relaxed_acceptance_for_thinking
|
<class 'bool'>
|
value
|
|
|
speculative_config.write_interval
|
<class 'int'>
|
value
|
|
|
telemetry_config.disabled
|
<class 'bool'>
|
value
|
|
|
telemetry_config.usage_context
|
<enum 'UsageContext'>
|
categorical
|
|
unknown, llm_class, cli_serve, cli_bench, cli_eval
|
tensor_parallel_size
|
<class 'int'>
|
value
|
|
|
tokenizer_mode
|
Literal['auto', 'slow']
|
categorical
|
|
auto, slow
|
trust_remote_code
|
<class 'bool'>
|
value
|
|
|