Basics: Prompt, Client, and Responses
# HTTP client wrapper for the Triton Inference Server
from triton_trt_llm import HttpTritonClient
Step 1: Structure the Query in a Prompt Template
# Nemotron chat template: <extra_id_0> opens the system turn and <extra_id_1>
# opens each user/assistant turn. The trailing "Assistant" line cues the model
# to generate its reply.
NEMOTRON_PROMPT_TEMPLATE = (
    """<extra_id_0>System
{system}
<extra_id_1>User
{prompt}
<extra_id_1>Assistant
"""
)
system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature."
prompt = 'What is the fastest land animal?'
prompt = NEMOTRON_PROMPT_TEMPLATE.format(prompt=prompt, system=system)
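After formatting, prompt holds the complete chat turn that will be sent to the model; printing it is a quick sanity check:

# Inspect the fully rendered prompt before sending it to the server.
print(prompt)
# <extra_id_0>System
# You are a helpful, respectful and honest assistant. ...
# <extra_id_1>User
# What is the fastest land animal?
# <extra_id_1>Assistant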
Step 2: Create the Triton Client
triton_url = "llm:8000"  # host:port of the running Triton server
client = HttpTritonClient(triton_url)
pload = {
    'prompt': [[prompt]],        # batch of one formatted prompt
    'tokens': 64,                # maximum number of tokens to generate
    'temperature': 1.0,          # sampling temperature
    'top_k': 1,                  # greedy decoding: keep only the top token
    'top_p': 0,                  # disable nucleus (top-p) sampling
    'beam_width': 1,             # no beam search
    'repetition_penalty': 1.0,   # no penalty for repeated tokens
    'length_penalty': 1.0        # neutral length penalty
}
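With top_k set to 1, decoding is effectively greedy and deterministic. To get more varied answers you could relax the sampling settings; the values below are purely illustrative, not tuned:

# Illustrative (not tuned) sampling settings for more varied output.
sampled_pload = dict(pload, temperature=0.7, top_k=40, top_p=0.9)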
Step 3: Load the Model and Generate Response
model_name = "ensemble"                    # Triton "ensemble" model served for this deployment
client.load_model(model_name)              # ensure the model is loaded before sending requests
val = client.request(model_name, **pload)  # send the prompt and sampling parameters
print(val)
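The exact return type depends on the client wrapper. Assuming val comes back as the raw generated string, a minimal post-processing sketch might trim whitespace and stop at the next turn marker:

# Minimal post-processing sketch, assuming `val` is the raw generated string.
if isinstance(val, str):
    # Cut off anything generated past the end of the assistant's own turn.
    answer = val.split("<extra_id_1>")[0].strip()
    print(answer)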