# Prometheus Metrics

Refer to the trtllm-serve documentation for starting a server.

Source: https://github.com/NVIDIA/TensorRT-LLM/blob/61cef212a8c59e843521881f45eee262c8f0525d/examples/serve/prometheus_metrics.py.

  1
  2# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3# SPDX-License-Identifier: Apache-2.0
  4#
  5# Licensed under the Apache License, Version 2.0 (the "License");
  6# you may not use this file except in compliance with the License.
  7# You may obtain a copy of the License at
  8#
  9# http://www.apache.org/licenses/LICENSE-2.0
 10#
 11# Unless required by applicable law or agreed to in writing, software
 12# distributed under the License is distributed on an "AS IS" BASIS,
 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14# See the License for the specific language governing permissions and
 15# limitations under the License.
 16
 17from urllib.request import urlopen
 18
 19from openai import OpenAI
 20
 21# Initialize the OpenAI client
 22client = OpenAI(
 23    base_url="http://localhost:8000/v1",
 24    api_key="tensorrt_llm",
 25)
 26
 27# Prometheus metric prefix used by TensorRT-LLM
 28METRIC_PREFIX = "trtllm_"
 29
 30# Base URL for the metrics endpoint
 31METRICS_URL = "http://localhost:8000/prometheus/metrics"
 32
 33
 34def fetch_metrics() -> str | None:
 35    """Fetch raw Prometheus exposition text from the metrics endpoint.
 36
 37    Returns:
 38        The decoded response body as a string, or None if the request
 39        failed or returned a non-200 status.
 40    """
 41    try:
 42        response = urlopen(METRICS_URL)
 43        if response.status == 200:
 44            return response.read().decode("utf-8")
 45        else:
 46            print(f"Error fetching metrics: HTTP {response.status}")
 47            return None
 48    except Exception as e:
 49        print(f"Error fetching metrics: {e}")
 50        return None
 51
 52
 53def parse_and_display_metrics(metrics_data: str) -> None:
 54    """Parse Prometheus exposition text and print TensorRT-LLM metrics.
 55
 56    Searches the raw text for a predefined set of metrics (request counts,
 57    latency histograms, KV cache stats). Found metrics are printed with
 58    their sample lines; missing metrics are listed separately.
 59
 60    Args:
 61        metrics_data: Raw Prometheus exposition text returned by fetch_metrics().
 62    """
 63    if not metrics_data:
 64        return
 65
 66    print("\n" + "=" * 80)
 67    print("TensorRT-LLM Prometheus Metrics")
 68    print("=" * 80)
 69
 70    # Define metrics to display with descriptions
 71    metrics_of_interest = {
 72        f"{METRIC_PREFIX}request_success_total": "Total successful requests",
 73        f"{METRIC_PREFIX}e2e_request_latency_seconds": "End-to-end request latency",
 74        f"{METRIC_PREFIX}time_to_first_token_seconds": "Time to first token",
 75        f"{METRIC_PREFIX}request_queue_time_seconds": "Request queue time",
 76        f"{METRIC_PREFIX}kv_cache_hit_rate": "KV cache hit rate",
 77        f"{METRIC_PREFIX}kv_cache_reused_blocks_total": "KV cache reused blocks (cumulative)",
 78        f"{METRIC_PREFIX}kv_cache_missed_blocks_total": "KV cache missed blocks (cumulative)",
 79        f"{METRIC_PREFIX}kv_cache_utilization": "KV cache utilization",
 80    }
 81
 82    found_metrics = []
 83    missing_metrics = []
 84
 85    for metric_name, description in metrics_of_interest.items():
 86        if metric_name in metrics_data:
 87            found_metrics.append((metric_name, description))
 88        else:
 89            missing_metrics.append((metric_name, description))
 90
 91    # Display found metrics
 92    if found_metrics:
 93        print("\n✓ Available Metrics:")
 94        print("-" * 80)
 95        for metric_name, description in found_metrics:
 96            # Extract the metric lines from the data
 97            lines = [
 98                line
 99                for line in metrics_data.split("\n")
100                if line.startswith(metric_name) and not line.startswith("#")
101            ]
102            print(f"\n{description} ({metric_name}):")
103            for line in lines:
104                print(f"  {line}")
105
106    # Display missing metrics
107    if missing_metrics:
108        print("\n✗ Not Yet Available:")
109        print("-" * 80)
110        for metric_name, description in missing_metrics:
111            print(f"  {description} ({metric_name})")
112
113    print("\n" + "=" * 80)
114
115
def main():
    """Exercise a running TensorRT-LLM server and display its Prometheus metrics.

    Issues 10 completion requests one after another; after each response the
    Prometheus metrics endpoint is polled and its contents printed, showing
    how the counters, histograms, and gauges evolve.
    """
    print("Prometheus Metrics Example")
    print("=" * 80)
    print("This script will:")
    print("1. Send 10 completion requests to a running TensorRT-LLM server")
    print(
        "2. After each response, fetch and display Prometheus metrics from the /prometheus/metrics endpoint"
    )
    print()

    # Fire off the completion requests that will populate the metrics.
    print("Sending completion requests...")
    total_requests = 10
    for request_num in range(1, total_requests + 1):
        try:
            completion = client.completions.create(
                model="Server",
                prompt=(
                    f"Hello, this is request {request_num}. "
                    "Use your greatest imagination in this request. Tell me a lot about"
                ),
                max_tokens=1000,
                stream=False,
            )
            preview = completion.choices[0].text[:50]
            print(
                f"  Request {request_num}/{total_requests} completed. Response: {preview}..."
            )

            # Poll the metrics endpoint right after each response so the
            # printed values reflect the request that just finished.
            print(f"\n  Fetching metrics after request {request_num}...")
            metrics_data = fetch_metrics()
            if metrics_data:
                parse_and_display_metrics(metrics_data)
            else:
                print("  ✗ Failed to fetch metrics")
            print()
        except Exception as exc:
            # Keep going on failure: one bad request should not stop the demo.
            print(f"  Error on request {request_num}: {exc}")
    print("All requests completed.")


if __name__ == "__main__":
    main()