---
# Top-level compile and runtime limits. The schema that consumes these keys is
# not part of this file; the notes below only restate what is set here.
compile_backend: torch-cudagraph
max_batch_size: 64
max_seq_len: 4096
enable_chunked_prefill: true
cuda_graph_config:
  # Powers of two from 1 through 64; the largest entry matches max_batch_size.
  batch_sizes:
    - 1
    - 2
    - 4
    - 8
    - 16
    - 32
    - 64
# Per-transform settings, keyed by transform name.
# NOTE(review): keys not listed under an entry are presumably left at the
# consumer's defaults -- confirm against the loader/schema.
transforms:
  match_swiglu_pattern:
    enabled: true
  match_nvfp4_swiglu_pattern:
    enabled: true
  # Only `allow_different_input_scales` is set for this entry; unlike its
  # siblings it carries no explicit `enabled` key.
  # NOTE(review): presumably relies on the default enablement -- confirm.
  fuse_nvfp4_moe:
    allow_different_input_scales: true
  fuse_nvfp4_swiglu:
    enabled: true
  fuse_swiglu:
    enabled: true
  # The two multi_stream_* entries are the only ones here that also pin
  # `stage` (both to `compile`).
  multi_stream_moe:
    stage: compile
    enabled: true
  multi_stream_mla_attn:
    stage: compile
    enabled: true
