1# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Demonstrates using LMCache as a KV cache backend for TensorRT-LLM.
16
17Uses the KV Cache Connector interface.
18
19LMCache stores previously computed KV tensors and replays them on subsequent
20requests with the same prefix, reducing recomputation.
21
22The connector implementation lives in LMCache:
23 lmcache.integration.tensorrt_llm.tensorrt_adapter
24
25TRT-LLM resolves the ``"lmcache"`` preset to the correct import paths
26automatically via the connector registry.
27
28Prerequisites:
29 pip install lmcache
30
31How to run:
32 PYTHONHASHSEED=0 python llm_lmcache_connector.py Qwen/Qwen2-1.5B-Instruct
33
34Note: PYTHONHASHSEED=0 must be set before the Python process starts
35to ensure deterministic cache key hashing in LMCache.
36
37Expected output:
38 Second request logs show "Retrieved N tokens" and both outputs are identical.
39
40See Also:
41 examples/llm-api/configs/trtllm_lmcache_connector_extra.yaml -- trtllm-serve YAML
42"""
43
44import click
45
46from tensorrt_llm import LLM, SamplingParams
47from tensorrt_llm.llmapi.llm_args import KvCacheConfig, KvCacheConnectorConfig
48
# Fail fast at import time with an actionable message if LMCache (or its
# TensorRT-LLM integration module) is not installed, instead of surfacing a
# bare ImportError deep inside the example.
try:
    from lmcache.integration.tensorrt_llm import destroy_engine
except ImportError as e:
    raise ImportError(
        "LMCache is not installed or is missing the TensorRT-LLM integration. "
        "Run: pip install 'lmcache'"
    ) from e
56
# A prompt long enough to produce at least one full TRT-LLM KV block.
# Both requests in main() use this exact prompt so the second one hits the
# LMCache prefix cache populated by the first.
_TEST_PROMPT: str = (
    "Nvidia Corporation is an American technology company headquartered in "
    "Santa Clara, California. Founded in 1993 by Jensen Huang, Chris "
    "Malachowsky, and Curtis Priem, it develops graphics processing units "
    "(GPUs), system on a chips (SoCs), and application programming "
    "interfaces (APIs) for data science, high-performance computing, and "
    "mobile and automotive applications. Tell me about the company."
)
66
67
@click.command()
@click.argument("model", type=str)
def main(model: str):
    """Run the same prompt twice through one LLM instance to verify LMCache KV reuse.

    The first request populates the LMCache store; the second should retrieve
    the cached KV tensors and produce an identical completion.

    Raises:
        RuntimeError: If the two completions differ, indicating cache reuse
            did not work as expected.
    """
    kv_cache_config = KvCacheConfig(enable_block_reuse=True)
    kv_connector_config = KvCacheConnectorConfig(connector="lmcache")
    sampling_params = SamplingParams(max_tokens=32)

    # Both requests go to the same LLM instance so the in-process LMCache
    # engine (and its CPU memory cache) survives between the two calls.
    llm = LLM(
        model=model,
        backend="pytorch",
        kv_cache_config=kv_cache_config,
        kv_connector_config=kv_connector_config,
    )

    # Fix: the original called destroy_engine() only on the success path, so a
    # failed generate() or a mismatch left the LMCache engine alive. Guarantee
    # teardown with try/finally.
    try:
        print("--- First request (cold cache, KV will be computed and stored) ---")
        output0 = llm.generate([_TEST_PROMPT], sampling_params)
        text0 = output0[0].outputs[0].text
        print("First output:", text0)

        print("\n--- Second request (warm cache, KV should be retrieved) ---")
        output1 = llm.generate([_TEST_PROMPT], sampling_params)
        text1 = output1[0].outputs[0].text
        print("Second output (using LMCache KV cache):", text1)

        # Fix: `assert` is stripped under `python -O`; raise explicitly so the
        # correctness check this example exists to demonstrate always runs.
        if text0 != text1:
            raise RuntimeError(
                f"Outputs differ — cache reuse may not have worked correctly.\n"
                f"First: {text0!r}\n"
                f"Second: {text1!r}"
            )
        print("\nOK: outputs match, LMCache KV reuse confirmed.")
    finally:
        destroy_engine()
102
103
# Script entry point; click parses the MODEL argument from the command line.
if __name__ == "__main__":
    main()