LLM-API multi-GPU / multi-node (MGMN) distributed execution on Slurm
Source: NVIDIA/TensorRT-LLM.
#!/bin/bash
#SBATCH -A <account>     # parameter
#SBATCH -p <partition>   # parameter
#SBATCH -t 01:00:00
#SBATCH -N 1
#SBATCH --ntasks-per-node=2
#SBATCH -o logs/llmapi-distributed.out
#SBATCH -e logs/llmapi-distributed.err
#SBATCH -J llmapi-distributed-task
# NOTE: the logs/ directory must exist on the submit host before `sbatch` runs,
# otherwise Slurm cannot open the -o/-e files.

### Run LLM-API with pytorch backend on Slurm

# NOTE, this feature is experimental and may not work on all systems.
# The trtllm-llmapi-launch is a script that launches the LLM-API code on
# Slurm-like systems, and can support multi-node and multi-GPU setups.

# Note that, the number of MPI processes should be the same as the model world
# size. e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
# each, or 4 nodes with 4 gpus for each or other combinations.

# This docker image should have tensorrt_llm installed, or you need to install
# it in the task.

# The following variables are expected to be set in the environment:
# You can set them via --export in the srun/sbatch command.
# CONTAINER_IMAGE: the docker image to use, you'd better install tensorrt_llm in it, or install it in the task.
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# PROLOGUE: the prologue to run before the script (optional; defaults to empty)
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in
# the LOCAL_MODEL directory.

# Fail fast with a clear message when a required variable is missing, instead
# of letting srun or the container runtime fail with a cryptic error mid-launch.
: "${CONTAINER_IMAGE:?CONTAINER_IMAGE must be set (docker image with tensorrt_llm installed)}"
: "${MOUNT_DIR:?MOUNT_DIR must be set (host directory to mount into the container)}"
: "${MOUNT_DEST:?MOUNT_DEST must be set (mount destination inside the container)}"
: "${WORKDIR:?WORKDIR must be set (working directory inside the container)}"
: "${SOURCE_ROOT:?SOURCE_ROOT must be set (path to the TensorRT-LLM source tree)}"
: "${LOCAL_MODEL:?LOCAL_MODEL must be set (pre-downloaded local model directory)}"
PROLOGUE="${PROLOGUE:-}"  # optional extra setup commands, run inside each task

# Adjust the paths to run
export script="$SOURCE_ROOT/examples/pytorch/quickstart_advanced.py"

# Just launch the PyTorch example with trtllm-llmapi-launch command.
# The bash -c payload is intentionally double-quoted: $PROLOGUE, $PATH, $script
# and $LOCAL_MODEL are expanded on the submitting node, so each srun task
# receives the already-resolved command line.
srun -l \
    --container-image="${CONTAINER_IMAGE}" \
    --container-mounts="${MOUNT_DIR}:${MOUNT_DEST}" \
    --container-workdir="${WORKDIR}" \
    --export=ALL \
    --mpi=pmix \
    bash -c "
        $PROLOGUE
        export PATH=$PATH:~/.local/bin
        trtllm-llmapi-launch python3 $script \
            --model_dir $LOCAL_MODEL \
            --prompt 'Hello, how are you?' \
            --tp_size 2
    "