Get KV Cache Events

Source: NVIDIA/TensorRT-LLM.

This example enables KV cache event tracking (block reuse plus an event buffer in KvCacheConfig) and, after generation, retrieves the recorded events with llm.get_kv_cache_events().

### Get KV Cache Events

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import KvCacheConfig


def main():
    # Configure the PyTorch backend: enable the overlap scheduler,
    # disable the autotuner, and derive the KV cache dtype from the model.
    pytorch_config = PyTorchConfig(enable_overlap_scheduler=True,
                                   autotuner_enabled=False,
                                   kv_cache_dtype='auto')

    # KV cache events require block reuse to be enabled and a nonzero
    # event buffer size.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
              tensor_parallel_size=2,
              pytorch_backend_config=pytorch_config,
              kv_cache_config=KvCacheConfig(enable_block_reuse=True,
                                            event_buffer_max_size=1024),
              backend="pytorch")

    # Sample prompts sharing a common prefix, so the second request can
    # reuse KV cache blocks stored for the first.
    common_prefix = (
        "After the ghost's departure, Barnardo notes Horatio's pale appearance and asks if he's okay. "
        "Horatio concedes that he's shaken and confesses that, without witnessing the ghost himself, he wouldn't have believed it existed. "
        "He's also disturbed by the ghost's striking resemblance to the king. It even seems to be wearing the former king's armor. "
        "Horatio thinks the ghost's presence foretells that something is about to go wrong in Denmark. "
        "Marcellus concurs with Horatio, as he and the other guards have observed that their schedules have become more rigorous and have also noticed the preparations taking place within Elsinore, including the building of cannons, the storing of weapons, and the preparation of ships."
    )
    prompts = [
        common_prefix, common_prefix + " Marcellus also notes that the king's"
    ]

    # Create sampling params (near-greedy decoding, short completions).
    sampling_params = SamplingParams(temperature=0.001,
                                     top_p=0.001,
                                     max_tokens=5)

    for output in llm.generate(prompts, sampling_params=sampling_params):
        print(
            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
        )

    # Drain buffered KV cache events; the argument is the retrieval timeout.
    kv_events = llm.get_kv_cache_events(10)
    print(kv_events)

    # Example output:
    # [{'event_id': 0, 'data': {'type': 'created', 'num_blocks_per_cache_level': [101230, 0]}},
    #  {'event_id': 1, 'data': {'type': 'stored', 'parent_hash': None, 'blocks': [{'type': 'stored_block', 'block_hash': 4203099703668305365, 'tokens': [{'type': 'unique_token', 'token_id': 1, 'token_extra_id': 0}, ...


if __name__ == '__main__':
    main()