Basics: Prompt, Client, and Responses
# HTTP client wrapper for the Triton Inference Server
from triton_trt_llm import HttpTritonClient
Step 1: Structure the Query in a Prompt Template
# Nemotron chat template: <extra_id_0> opens the system turn and <extra_id_1>
# opens each user/assistant turn. The trailing "Assistant" line cues the model
# to generate its reply.
NEMOTRON_PROMPT_TEMPLATE = (
    """<extra_id_0>System
{system}
<extra_id_1>User
{prompt}
<extra_id_1>Assistant
"""
)
system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature."
prompt = 'What is the fastest land animal?'
prompt = NEMOTRON_PROMPT_TEMPLATE.format(prompt=prompt, system=system)
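After formatting, prompt holds the complete chat turn that will be sent to the model; printing it is a quick sanity check:

# Inspect the fully rendered prompt before sending it to the server.
print(prompt)
# <extra_id_0>System
# You are a helpful, respectful and honest assistant. ...
# <extra_id_1>User
# What is the fastest land animal?
# <extra_id_1>Assistant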
Step 2: Create the Triton Client
triton_url = "llm:8000"  # host:port of the running Triton server
client = HttpTritonClient(triton_url)
pload = {
    'prompt': [[prompt]],        # batch of one formatted prompt
    'tokens': 64,                # maximum number of tokens to generate
    'temperature': 1.0,          # sampling temperature
    'top_k': 1,                  # greedy decoding: keep only the top token
    'top_p': 0,                  # disable nucleus (top-p) sampling
    'beam_width': 1,             # no beam search
    'repetition_penalty': 1.0,   # no penalty for repeated tokens
    'length_penalty': 1.0        # neutral length penalty
}
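With top_k set to 1, decoding is effectively greedy and deterministic. To get more varied answers you could relax the sampling settings; the values below are purely illustrative, not tuned:

# Illustrative (not tuned) sampling settings for more varied output.
sampled_pload = dict(pload, temperature=0.7, top_k=40, top_p=0.9)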
Step 3: Load the Model and Generate Response
model_name = "ensemble"                    # Triton "ensemble" model served for this deployment
client.load_model(model_name)              # ensure the model is loaded before sending requests
val = client.request(model_name, **pload)  # send the prompt and sampling parameters
print(val)
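The exact return type depends on the client wrapper. Assuming val comes back as the raw generated string, a minimal post-processing sketch might trim whitespace and stop at the next turn marker:

# Minimal post-processing sketch, assuming `val` is the raw generated string.
if isinstance(val, str):
    # Cut off anything generated past the end of the assistant's own turn.
    answer = val.split("<extra_id_1>")[0].strip()
    print(answer)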