Run trtllm-bench with the PyTorch backend on Slurm
Source: NVIDIA/TensorRT-LLM.
#!/bin/bash
#SBATCH -A <account>
#SBATCH -p <partition>
#SBATCH -t 01:00:00
#SBATCH -N 2
#SBATCH --ntasks-per-node=8
#SBATCH -o logs/trtllm-bench.out
#SBATCH -e logs/trtllm-bench.err
#SBATCH -J trtllm-bench

##############################################################################
# OVERVIEW:
# This script runs trtllm-bench throughput benchmarking on SLURM with multi-node,
# multi-GPU setup. It prepares a synthetic dataset and then benchmarks the model
# using the PyTorch backend with tensor parallelism.
#
# WHAT TO MODIFY:
# 1. SLURM Parameters (lines 2-9):
#    - Replace <account> with your SLURM account name
#    - Replace <partition> with your SLURM partition name
#    - Adjust -N (number of nodes) based on your TP size
#    - Adjust --ntasks-per-node (GPUs per node) to match your setup
#    NOTE: the logs/ directory must exist before submission, or sbatch cannot
#    write the -o/-e files.
#
# 2. Environment Variables (set before running sbatch):
#    - CONTAINER_IMAGE: Docker image with TensorRT-LLM installed
#    - MOUNT_DIR: Host directory to mount in container
#    - MOUNT_DEST: Container mount destination path
#    - WORKDIR: Working directory inside container
#    - SOURCE_ROOT: Path to TensorRT-LLM source code
#    - PROLOGUE: Commands to run before main task (e.g., module loads)
#    - LOCAL_MODEL: Path to your pre-downloaded model directory
#    - MODEL_NAME: Name of the model to benchmark
#    - EXTRA_ARGS: (Optional) Additional benchmark arguments
#
# 3. Model Configuration:
#    - --tp 16: Adjust tensor parallelism size to match your node/GPU setup
#    - --num-requests: Change number of benchmark requests
#    - --input-mean/--output-mean: Adjust token lengths
#
# EXAMPLE USAGE:
#   export CONTAINER_IMAGE="nvcr.io/nvidia/tensorrt_llm:latest"
#   export LOCAL_MODEL="/path/to/llama-model"
#   export MODEL_NAME="meta-llama/Llama-2-7b-hf"
#   sbatch llm_mgmn_trtllm_bench.sh
##############################################################################

# NOTE, this feature is experimental and may not work on all systems.
# The trtllm-llmapi-launch is a script that launches the LLM-API code on
# Slurm-like systems, and can support multi-node and multi-GPU setups.

# IMPORTANT: Total MPI processes (nodes x ntasks-per-node) must equal
# tensor_parallel_size. e.g. For tensor_parallel_size=16, you may use 2 nodes
# with 8 gpus for each, or 4 nodes with 4 gpus for each or other combinations.

# This docker image should have tensorrt_llm installed, or you need to install
# it in the task.

# The following variables are expected to be set in the environment:
# You can set them via --export in the srun/sbatch command.
# CONTAINER_IMAGE: the docker image to use, you'd better install tensorrt_llm
#   in it, or install it in the task.
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
#   not supported in Slurm mode, you need to download the model and put it in
#   the LOCAL_MODEL directory.

# Dataset-generation helper shipped with the TensorRT-LLM source tree.
export prepare_dataset="$SOURCE_ROOT/benchmarks/cpp/prepare_dataset.py"
# Destination of the synthetic benchmark dataset (inside the container mount).
export data_path="$WORKDIR/token-norm-dist.txt"

echo "Preparing dataset..."
# Generate the synthetic token-norm-dist dataset on a single task, inside the
# container so the TensorRT-LLM Python environment is available.
# NOTE(review): the inner script is double-quoted, so $PROLOGUE,
# $prepare_dataset, $LOCAL_MODEL and $data_path are expanded on the submitting
# side before srun launches the task — confirm this is the intended behavior
# (with --export=ALL the same values are also forwarded as environment).
srun -l \
    -N 1 \
    -n 1 \
    --container-image="${CONTAINER_IMAGE}" \
    --container-name="prepare-name" \
    --container-mounts="${MOUNT_DIR}:${MOUNT_DEST}" \
    --container-workdir="${WORKDIR}" \
    --export=ALL \
    --mpi=pmix \
    bash -c "
        $PROLOGUE
        python3 $prepare_dataset \
            --tokenizer=$LOCAL_MODEL \
            --stdout token-norm-dist \
            --num-requests=100 \
            --input-mean=128 \
            --output-mean=128 \
            --input-stdev=0 \
            --output-stdev=0 > $data_path
    "

echo "Running benchmark..."
# Just launch trtllm-bench job with trtllm-llmapi-launch command.
# No -N/-n here: this step uses every task requested by the #SBATCH directives,
# and the total task count must equal the --tp value below.
# NOTE(review): the inner script is double-quoted, so $PROLOGUE, $PATH,
# $MODEL_NAME, $LOCAL_MODEL, $data_path and $EXTRA_ARGS all expand on the
# submitting host before the task starts — confirm this is intended.
# The here-doc body and its EOF terminator must stay at column 0 (plain <<,
# not <<-).
srun -l \
    --container-image="${CONTAINER_IMAGE}" \
    --container-mounts="${MOUNT_DIR}:${MOUNT_DEST}" \
    --container-workdir="${WORKDIR}" \
    --export=ALL,PYTHONPATH="${SOURCE_ROOT}" \
    --mpi=pmix \
    bash -c "
        set -ex
        $PROLOGUE
        export PATH=$PATH:~/.local/bin

        # This is optional: extra LLM-API options for the PyTorch backend.
        cat > /tmp/pytorch_extra_args.txt << EOF
cuda_graph_config: null
print_iter_log: true
enable_attention_dp: false
EOF

        # launch the benchmark
        trtllm-llmapi-launch \
            trtllm-bench \
            --model $MODEL_NAME \
            --model_path $LOCAL_MODEL \
            throughput \
            --dataset $data_path \
            --backend pytorch \
            --tp 16 \
            --extra_llm_api_options /tmp/pytorch_extra_args.txt \
            $EXTRA_ARGS
    "