# Prometheus Metrics

Refer to the `trtllm-serve` documentation for starting a server. Source: NVIDIA/TensorRT-LLM.
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from urllib.request import urlopen

from openai import OpenAI

# OpenAI-compatible client pointed at a locally running trtllm-serve
# instance (see the trtllm-serve documentation for starting a server).
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="tensorrt_llm",
)

# Prometheus metric prefix used by TensorRT-LLM
METRIC_PREFIX = "trtllm_"

# Base URL for the metrics endpoint
METRICS_URL = "http://localhost:8000/prometheus/metrics"


34def fetch_metrics() -> dict | None:
35 """Fetch metrics from the Prometheus endpoint."""
36 try:
37 response = urlopen(METRICS_URL)
38 if response.status == 200:
39 return response.read().decode("utf-8")
40 else:
41 print(f"Error fetching metrics: HTTP {response.status}")
42 return None
43 except Exception as e:
44 print(f"Error fetching metrics: {e}")
45 return None
46
47
def parse_and_display_metrics(metrics_data: str) -> None:
    """Parse Prometheus exposition text and print TensorRT-LLM metrics.

    Args:
        metrics_data: Raw text fetched from the /prometheus/metrics
            endpoint (as returned by fetch_metrics). A falsy value is a
            no-op.
    """
    if not metrics_data:
        return

    print("\n" + "=" * 80)
    print("TensorRT-LLM Prometheus Metrics")
    print("=" * 80)

    # Metrics to surface, mapped to human-readable descriptions.
    metrics_of_interest = {
        f"{METRIC_PREFIX}request_success_total": "Total successful requests",
        f"{METRIC_PREFIX}e2e_request_latency_seconds": "End-to-end request latency",
        f"{METRIC_PREFIX}time_to_first_token_seconds": "Time to first token",
        f"{METRIC_PREFIX}request_queue_time_seconds": "Request queue time",
        f"{METRIC_PREFIX}kv_cache_hit_rate": "KV cache hit rate",
        f"{METRIC_PREFIX}kv_cache_utilization": "KV cache utilization",
    }

    # Partition into metrics present in the payload vs. not yet emitted.
    found_metrics = []
    missing_metrics = []
    for metric_name, description in metrics_of_interest.items():
        if metric_name in metrics_data:
            found_metrics.append((metric_name, description))
        else:
            missing_metrics.append((metric_name, description))

    # Split the payload once, not once per metric.
    all_lines = metrics_data.split("\n")

    # Display found metrics
    if found_metrics:
        print("\n✓ Available Metrics:")
        print("-" * 80)
        for metric_name, description in found_metrics:
            # Sample lines begin with the metric name; "# HELP"/"# TYPE"
            # comment lines are excluded.
            samples = [
                line
                for line in all_lines
                if line.startswith(metric_name) and not line.startswith("#")
            ]
            print(f"\n{description} ({metric_name}):")
            for line in samples:
                print(f"  {line}")

    # Display missing metrics
    if missing_metrics:
        print("\n✗ Not Yet Available:")
        print("-" * 80)
        for metric_name, description in missing_metrics:
            print(f"  {description} ({metric_name})")

    print("\n" + "=" * 80)


def main():
    """Drive the demo: send completions, then fetch and show metrics.

    Sends a batch of completion requests to a running trtllm-serve
    instance via the module-level client, fetching and displaying the
    Prometheus metrics after each response.
    """
    print("Prometheus Metrics Example")
    print("=" * 80)
    print("This script will:")
    print("1. Send several completion requests to a running TensorRT-LLM server")
    print(
        "2. After each response, fetch and display Prometheus metrics from the /prometheus/metrics endpoint"
    )
    print()

    # Make several completion requests to generate metrics
    print("Sending completion requests...")
    num_requests = 10
    for i in range(num_requests):
        try:
            response = client.completions.create(
                model="Server",
                prompt=(
                    f"Hello, this is request {i + 1}. "
                    "Use your greatest imagination in this request. Tell me a lot about"
                ),
                max_tokens=1000,
                stream=False,
            )
            print(
                f"  Request {i + 1}/{num_requests} completed. Response: {response.choices[0].text[:50]}..."
            )

            # Fetch and display metrics after each response
            print(f"\n  Fetching metrics after request {i + 1}...")
            metrics_data = fetch_metrics()
            if metrics_data:
                parse_and_display_metrics(metrics_data)
            else:
                print("  ✗ Failed to fetch metrics")
            print()
        except Exception as e:
            # Keep going so one failed request doesn't abort the demo.
            print(f"  Error on request {i + 1}: {e}")
    print("All requests completed.")


if __name__ == "__main__":
    main()