OpenAI Chat Client for Multimodal
Refer to the trtllm-serve documentation for instructions on starting a server.
Source: NVIDIA/TensorRT-LLM.
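For example, a matching server can be started with a command along the following lines. This is a sketch rather than the canonical invocation: the Hugging Face model ID and the explicit host/port are assumptions chosen to match the client configuration below (trtllm-serve also accepts a local checkpoint path).

trtllm-serve Qwen/Qwen2.5-VL-3B-Instruct --host localhost --port 8000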
import os
from pathlib import Path

from openai import OpenAI
from PIL import Image

from tensorrt_llm.inputs import (encode_base64_content_from_url,
                                 encode_base64_image)

# Connect to the OpenAI-compatible endpoint exposed by trtllm-serve.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="tensorrt_llm",
)

# Use local test assets when LLM_MODELS_ROOT is set; otherwise fall back to
# publicly hosted files.
llm_models_root = os.environ.get("LLM_MODELS_ROOT")

if llm_models_root is not None:
    multimodal_test_data_path = Path(
        llm_models_root) / "multimodals" / "test_data"
    image_url1 = str(multimodal_test_data_path / "seashore.png")
    image_url2 = str(multimodal_test_data_path / "inpaint.png")
    video_url = str(multimodal_test_data_path / "OAI-sora-tokyo-walk.mp4")
    image64 = encode_base64_image(
        Image.open(multimodal_test_data_path / "seashore.png"))
else:
    image_url1 = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
    image_url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"
    video_url = "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4"
    image64 = encode_base64_content_from_url(
        "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
    )

# SINGLE IMAGE INFERENCE
response = client.chat.completions.create(
    model="Qwen2.5-VL-3B-Instruct",
    messages=[{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": [{
            "type": "text",
            "text": "Describe the natural environment in the image."
        }, {
            "type": "image_url",
            "image_url": {
                "url": image_url1
            }
        }]
    }],
    max_tokens=64,
)
print(response)

# MULTI IMAGE INFERENCE
response = client.chat.completions.create(
    model="Qwen2.5-VL-3B-Instruct",
    messages=[{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": [{
            "type": "text",
            "text": "Tell me the difference between the two images."
        }, {
            "type": "image_url",
            "image_url": {
                "url": image_url2
            }
        }, {
            "type": "image_url",
            "image_url": {
                "url": image_url1
            }
        }]
    }],
    max_tokens=64,
)
print(response)

# SINGLE VIDEO INFERENCE
response = client.chat.completions.create(
    model="Qwen2.5-VL-3B-Instruct",
    messages=[{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": [{
            "type": "text",
            "text": "Tell me what you see in the video briefly."
        }, {
            "type": "video_url",
            "video_url": {
                "url": video_url
            }
        }]
    }],
    max_tokens=64,
)
print(response)

# IMAGE EMBED INFERENCE
# Pass the image inline as a base64-encoded data URL instead of a remote URL.
response = client.chat.completions.create(
    model="Qwen2.5-VL-3B-Instruct",
    messages=[{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": [{
            "type": "text",
            "text": "Describe the natural environment in the image."
        }, {
            "type": "image_url",
            "image_url": {
                "url": "data:image/png;base64," + image64
            }
        }]
    }],
    max_tokens=64,
)
print(response)
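Each call above prints the full response object. If only the generated text is needed, the standard OpenAI SDK accessor applies; a minimal sketch, reusing the response from the last request:

# A chat completion exposes the generated text on its first choice.
print(response.choices[0].message.content)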