LMCache KV Cache Connector

Source: https://github.com/NVIDIA/TensorRT-LLM/blob/61cef212a8c59e843521881f45eee262c8f0525d/examples/llm-api/llm_lmcache_connector.py.

  1# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2# SPDX-License-Identifier: Apache-2.0
  3#
  4# Licensed under the Apache License, Version 2.0 (the "License");
  5# you may not use this file except in compliance with the License.
  6# You may obtain a copy of the License at
  7#
  8# http://www.apache.org/licenses/LICENSE-2.0
  9#
 10# Unless required by applicable law or agreed to in writing, software
 11# distributed under the License is distributed on an "AS IS" BASIS,
 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13# See the License for the specific language governing permissions and
 14# limitations under the License.
 15"""Demonstrates using LMCache as a KV cache backend for TensorRT-LLM.
 16
 17Uses the KV Cache Connector interface.
 18
 19LMCache stores previously computed KV tensors and replays them on subsequent
 20requests with the same prefix, reducing recomputation.
 21
 22The connector implementation lives in LMCache:
 23  lmcache.integration.tensorrt_llm.tensorrt_adapter
 24
 25TRT-LLM resolves the ``"lmcache"`` preset to the correct import paths
 26automatically via the connector registry.
 27
 28Prerequisites:
 29  pip install lmcache
 30
 31How to run:
 32  PYTHONHASHSEED=0 python llm_lmcache_connector.py Qwen/Qwen2-1.5B-Instruct
 33
 34Note: PYTHONHASHSEED=0 must be set before the Python process starts
 35to ensure deterministic cache key hashing in LMCache.
 36
 37Expected output:
 38  Second request logs show "Retrieved N tokens" and both outputs are identical.
 39
 40See Also:
 41  examples/llm-api/configs/trtllm_lmcache_connector_extra.yaml -- trtllm-serve YAML
 42"""
 43
 44import click
 45
 46from tensorrt_llm import LLM, SamplingParams
 47from tensorrt_llm.llmapi.llm_args import KvCacheConfig, KvCacheConnectorConfig
 48
# Fail fast with an actionable message if LMCache (an optional dependency)
# or its TensorRT-LLM integration module is not importable; the bare
# ImportError alone would not tell the user how to fix the environment.
try:
    from lmcache.integration.tensorrt_llm import destroy_engine
except ImportError as e:
    raise ImportError(
        "LMCache is not installed or is missing the TensorRT-LLM integration. "
        "Run: pip install 'lmcache'"
    ) from e
 56
# A prompt long enough to produce at least one full TRT-LLM KV block.
# (Prefix reuse operates at block granularity, so a shorter prompt would
# presumably give LMCache nothing to store — TODO confirm block size.)
_TEST_PROMPT = (
    "Nvidia Corporation is an American technology company headquartered in "
    "Santa Clara, California. Founded in 1993 by Jensen Huang, Chris "
    "Malachowsky, and Curtis Priem, it develops graphics processing units "
    "(GPUs), system on a chips (SoCs), and application programming "
    "interfaces (APIs) for data science, high-performance computing, and "
    "mobile and automotive applications. Tell me about the company."
)
 66
 67
 68@click.command()
 69@click.argument("model", type=str)
 70def main(model: str):
 71    kv_cache_config = KvCacheConfig(enable_block_reuse=True)
 72    kv_connector_config = KvCacheConnectorConfig(connector="lmcache")
 73    sampling_params = SamplingParams(max_tokens=32)
 74
 75    # Both requests go to the same LLM instance so the in-process LMCache
 76    # engine (and its CPU memory cache) survives between the two calls.
 77    llm = LLM(
 78        model=model,
 79        backend="pytorch",
 80        kv_cache_config=kv_cache_config,
 81        kv_connector_config=kv_connector_config,
 82    )
 83
 84    print("--- First request (cold cache, KV will be computed and stored) ---")
 85    output0 = llm.generate([_TEST_PROMPT], sampling_params)
 86    text0 = output0[0].outputs[0].text
 87    print("First output:", text0)
 88
 89    print("\n--- Second request (warm cache, KV should be retrieved) ---")
 90    output1 = llm.generate([_TEST_PROMPT], sampling_params)
 91    text1 = output1[0].outputs[0].text
 92    print("Second output (using LMCache KV cache):", text1)
 93
 94    assert text0 == text1, (
 95        f"Outputs differ — cache reuse may not have worked correctly.\n"
 96        f"First:  {text0!r}\n"
 97        f"Second: {text1!r}"
 98    )
 99    print("\nOK: outputs match, LMCache KV reuse confirmed.")
100
101    destroy_engine()
102
103
# Script entry point: delegate to the click-decorated CLI command.
if __name__ == "__main__":
    main()