Prometheus Metrics
Refer to the trtllm-serve documentation for starting a server.

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from urllib.request import urlopen

from openai import OpenAI

# Initialize the OpenAI client against a locally running trtllm-serve instance.
# The api_key value is a placeholder; trtllm-serve does not validate it.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="tensorrt_llm",
)

# Prometheus metric prefix used by TensorRT-LLM
METRIC_PREFIX = "trtllm_"

# Base URL for the metrics endpoint
METRICS_URL = "http://localhost:8000/prometheus/metrics"

def fetch_metrics() -> str | None:
    """Fetch raw Prometheus exposition text from the metrics endpoint.

    Returns:
        The decoded response body as a string, or None if the request
        failed or returned a non-200 status.
    """
    try:
        # Context manager guarantees the HTTP connection is closed even if
        # reading or decoding the body raises.
        with urlopen(METRICS_URL) as response:
            if response.status == 200:
                return response.read().decode("utf-8")
            print(f"Error fetching metrics: HTTP {response.status}")
            return None
    except OSError as e:
        # urllib's URLError/HTTPError (and plain socket errors) all derive
        # from OSError, so this covers the expected network failure modes
        # without swallowing unrelated programming errors.
        print(f"Error fetching metrics: {e}")
        return None

52
def parse_and_display_metrics(metrics_data: str) -> None:
    """Parse Prometheus exposition text and print TensorRT-LLM metrics.

    Searches the raw text for a predefined set of metrics (request counts,
    latency histograms, KV cache stats). Found metrics are printed with
    their sample lines; missing metrics are listed separately.

    Args:
        metrics_data: Raw Prometheus exposition text returned by fetch_metrics().
    """
    if not metrics_data:
        return

    print("\n" + "=" * 80)
    print("TensorRT-LLM Prometheus Metrics")
    print("=" * 80)

    # Metrics to display, mapped to human-readable descriptions.
    metrics_of_interest = {
        f"{METRIC_PREFIX}request_success_total": "Total successful requests",
        f"{METRIC_PREFIX}e2e_request_latency_seconds": "End-to-end request latency",
        f"{METRIC_PREFIX}time_to_first_token_seconds": "Time to first token",
        f"{METRIC_PREFIX}request_queue_time_seconds": "Request queue time",
        f"{METRIC_PREFIX}kv_cache_hit_rate": "KV cache hit rate",
        f"{METRIC_PREFIX}kv_cache_reused_blocks_total": "KV cache reused blocks (cumulative)",
        f"{METRIC_PREFIX}kv_cache_missed_blocks_total": "KV cache missed blocks (cumulative)",
        f"{METRIC_PREFIX}kv_cache_utilization": "KV cache utilization",
    }

    found_metrics = []
    missing_metrics = []

    for metric_name, description in metrics_of_interest.items():
        if metric_name in metrics_data:
            found_metrics.append((metric_name, description))
        else:
            missing_metrics.append((metric_name, description))

    # Split the exposition text once; reused for every found metric below
    # instead of re-splitting inside the loop.
    all_lines = metrics_data.split("\n")

    # Display found metrics
    if found_metrics:
        print("\n✓ Available Metrics:")
        print("-" * 80)
        for metric_name, description in found_metrics:
            # Sample lines start with the metric name; '# HELP' / '# TYPE'
            # comment lines are excluded.
            lines = [
                line
                for line in all_lines
                if line.startswith(metric_name) and not line.startswith("#")
            ]
            print(f"\n{description} ({metric_name}):")
            for line in lines:
                print(f"  {line}")

    # Display missing metrics
    if missing_metrics:
        print("\n✗ Not Yet Available:")
        print("-" * 80)
        for metric_name, description in missing_metrics:
            print(f"  {description} ({metric_name})")

    print("\n" + "=" * 80)

115
def main():
    """Send completion requests to a running TensorRT-LLM server and display Prometheus metrics.

    Sends 10 completion requests sequentially, fetching and printing
    the Prometheus metrics after each response to show how counters, histograms,
    and gauges evolve over time.
    """
    print("Prometheus Metrics Example")
    print("=" * 80)
    print("This script will:")
    print("1. Send 10 completion requests to a running TensorRT-LLM server")
    print(
        "2. After each response, fetch and display Prometheus metrics from the /prometheus/metrics endpoint"
    )
    print()

    # Make several completion requests to generate metrics
    print("Sending completion requests...")
    NUM_REQUESTS = 10
    for i in range(NUM_REQUESTS):
        try:
            response = client.completions.create(
                model="Server",
                prompt=(
                    f"Hello, this is request {i + 1}. "
                    "Use your greatest imagination in this request. Tell me a lot about"
                ),
                max_tokens=1000,
                stream=False,
            )
            print(
                f"  Request {i + 1}/{NUM_REQUESTS} completed. Response: {response.choices[0].text[:50]}..."
            )

            # Fetch and display metrics after each response
            print(f"\n  Fetching metrics after request {i + 1}...")
            metrics_data = fetch_metrics()
            if metrics_data:
                parse_and_display_metrics(metrics_data)
            else:
                print("  ✗ Failed to fetch metrics")
            print()
        except Exception as e:
            # Best-effort example loop: report the failure and continue with
            # the next request rather than aborting the demo.
            print(f"  Error on request {i + 1}: {e}")
    print("All requests completed.")

161
162
# Guard the entry point so importing this module does not start the demo.
if __name__ == "__main__":
    main()