Press Release Chat Bot
As part of this generative AI workflow, we create an NVIDIA PR chatbot that answers questions about NVIDIA news and blog posts from 2022 and 2023. For this, we have created a FastAPI-based REST server that wraps llama-index. The API server exposes two methods, `upload_document` and `generate`. The `upload_document` method takes a document from the user's computer and uploads it to a Milvus vector database after splitting, chunking, and embedding it. The `generate` method generates an answer to the provided prompt, optionally sourcing information from the vector database.
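Before running the steps below, you can confirm the chain server is reachable and inspect its API surface. Because the server is built on FastAPI, it publishes an OpenAPI schema by default; the sketch below assumes the default FastAPI settings and the `chain-server:8081` address used throughout this notebook.
import requests
# Fetch the auto-generated OpenAPI schema (served by FastAPI at /openapi.json by default)
schema = requests.get("http://chain-server:8081/openapi.json").json()
# List the available endpoints and their HTTP methods
for path, methods in schema["paths"].items():
    print(path, "->", ", ".join(m.upper() for m in methods))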
Step-1: Load the PDF files from the dataset folder.
You can upload the PDF files containing the NVIDIA blogs to the `/documents` API endpoint on port 8081.
%%capture
!unzip dataset.zip
import os
import requests
import mimetypes
def upload_document(file_path, url):
    """Upload a single file to the document ingestion endpoint."""
    headers = {'accept': 'application/json'}
    mime_type, _ = mimetypes.guess_type(file_path)
    # Open the file in a context manager so the handle is closed after the upload
    with open(file_path, 'rb') as f:
        files = {'file': (file_path, f, mime_type)}
        response = requests.post(url, headers=headers, files=files)
    return response.text
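As a quick sanity check, you can ingest a single file first. The filename below is a placeholder; substitute any PDF extracted from `dataset.zip`.
# Hypothetical example file; replace with any PDF from the extracted dataset folder
print(upload_document("dataset/example.pdf", "http://chain-server:8081/documents"))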
def upload_pdf_files(folder_path, upload_url, num_files):
    """Upload up to num_files PDF files from folder_path."""
    i = 0
    for file_name in os.listdir(folder_path):
        _, ext = os.path.splitext(file_name)
        # Ingest only pdf files
        if ext.lower() == ".pdf":
            file_path = os.path.join(folder_path, file_name)
            print(upload_document(file_path, upload_url))
            i += 1
            # Stop once the requested number of files has been uploaded
            if i >= num_files:
                break
import time
start_time = time.time()
NUM_DOCS_TO_UPLOAD = 100
upload_pdf_files("dataset", "http://chain-server:8081/documents", NUM_DOCS_TO_UPLOAD)
print(f"--- {time.time() - start_time} seconds ---")
Step-2: Ask a question without referring to the knowledge base
Ask the TensorRT-LLM Llama 2 13B model a question about "the NVIDIA Grace superchip" without consulting the vector database/knowledge base by setting `use_knowledge_base` to `false`.
import time
import json

data = {
    "messages": [
        {
            "role": "user",
            "content": "how many cores are on the nvidia grace superchip?"
        }
    ],
    "use_knowledge_base": False,
    "max_tokens": 256
}
url = "http://chain-server:8081/generate"

start_time = time.time()
with requests.post(url, stream=True, json=data) as req:
    for chunk in req.iter_lines():
        raw_resp = chunk.decode("UTF-8")
        if not raw_resp:
            continue
        # Each server-sent event line is prefixed with "data: "; strip it before parsing
        resp_dict = json.loads(raw_resp[6:])
        resp_choices = resp_dict.get("choices", [])
        if len(resp_choices):
            resp_str = resp_choices[0].get("message", {}).get("content", "")
            print(resp_str, end="")
print(f"--- {time.time() - start_time} seconds ---")
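The same streaming loop appears again below, so it can optionally be factored into a small helper. This is a sketch of our own (the function name and the one-token-per-chunk assumption are not part of the workflow): it strips the `data: ` prefix from each server-sent-event line, prints the content as it arrives, and returns a chunk count and elapsed time.
def stream_generate(url, data):
    """POST a /generate request and stream the answer; returns (chunk count, elapsed seconds)."""
    chunks = 0
    start = time.time()
    with requests.post(url, stream=True, json=data) as req:
        for chunk in req.iter_lines():
            raw_resp = chunk.decode("UTF-8")
            if not raw_resp:
                continue
            # Strip the "data: " prefix of each server-sent event before parsing the JSON payload
            resp_dict = json.loads(raw_resp[6:])
            resp_choices = resp_dict.get("choices", [])
            if resp_choices:
                print(resp_choices[0].get("message", {}).get("content", ""), end="")
                chunks += 1
    return chunks, time.time() - start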
Now ask the same question with `use_knowledge_base` set to `true`.
data = {
    "messages": [
        {
            "role": "user",
            "content": "how many cores are on the nvidia grace superchip?"
        }
    ],
    "use_knowledge_base": True,
    "max_tokens": 50
}
url = "http://chain-server:8081/generate"

start_time = time.time()
tokens_generated = 0
with requests.post(url, stream=True, json=data) as req:
    for chunk in req.iter_lines():
        raw_resp = chunk.decode("UTF-8")
        if not raw_resp:
            continue
        # Each server-sent event line is prefixed with "data: "; strip it before parsing
        resp_dict = json.loads(raw_resp[6:])
        resp_choices = resp_dict.get("choices", [])
        if len(resp_choices):
            resp_str = resp_choices[0].get("message", {}).get("content", "")
            # Count each streamed chunk as one generated token
            tokens_generated += 1
            print(resp_str, end="")

total_time = time.time() - start_time
print(f"\n--- Generated {tokens_generated} tokens in {total_time} seconds ---")
print(f"--- {tokens_generated/total_time} tokens/sec")
Next steps
We have set up a playground UI where you can upload files and ask questions. The UI is available at the same IP address as the notebooks: host_ip:8090/converse