OpenAI Completion Client with JSON Schema

Refer to the trtllm-serve documentation for starting a server.
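
The example below requires guided decoding to be enabled on the server. A minimal sketch of the launch, assuming the `--extra_llm_api_options` flag of `trtllm-serve` and the TinyLlama checkpoint used in the client code; adjust the model path and backend to your setup:

# extra_llm_api_options.yaml
guided_decoding_backend: xgrammar  # or: llguidance

trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --extra_llm_api_options extra_llm_api_options.yaml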

Source: NVIDIA/TensorRT-LLM.

# This example requires `guided_decoding_backend` to be set to
# `xgrammar` or `llguidance` in the extra_llm_api_options.yaml file.
import json

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="tensorrt_llm",
)

response = client.chat.completions.create(
    model="TinyLlama-1.1B-Chat-v1.0",
    messages=[{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "Give me the information of the biggest city of China in JSON format.",
    }],
    temperature=0,
    # The `json` response format plus a JSON schema activates the server's
    # guided decoding, which constrains generation to schema-valid output.
    response_format={
        "type": "json",
        "schema": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "population": {"type": "integer"},
            },
            "required": ["name", "population"],
        },
    },
    # `chat_template_kwargs` is a chat-template option, not a JSON Schema
    # keyword, so it goes in the request body rather than inside the schema.
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)

content = response.choices[0].message.content
try:
    response_json = json.loads(content)
    assert "name" in response_json and "population" in response_json
    print(content)
except json.JSONDecodeError:
    print("Failed to decode JSON response")
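
The server-side guided decoder should already guarantee schema-conformant output, but the client can verify more strictly than the key-presence check above by validating against the same schema with the `jsonschema` package (not part of the original example; installing it is assumed). A minimal sketch, reusing `content` from the snippet above:

import jsonschema

city_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "population": {"type": "integer"},
    },
    "required": ["name", "population"],
}

try:
    # Raises if the response is not valid JSON or violates the schema.
    jsonschema.validate(instance=json.loads(content), schema=city_schema)
    print("Response conforms to the schema")
except (json.JSONDecodeError, jsonschema.ValidationError) as err:
    print(f"Response violates the schema: {err}")

This also catches type mismatches, such as `population` arriving as a string, which a plain key-presence check would miss.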