Press Release Chat Bot

As part of this generative AI workflow, we create an NVIDIA PR chatbot that answers questions about NVIDIA news and blog posts from 2022 and 2023. For this, we have created a FastAPI REST server that wraps llama-index. The API server exposes two methods, upload_document and generate. The upload_document method takes a document from the user's computer and uploads it to a Milvus vector database after splitting, chunking, and embedding it. The generate method produces an answer to the provided prompt, optionally retrieving supporting context from the vector database.
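At a glance, the two request shapes used in the steps below look like this (a minimal sketch distilled from the code that follows; chain-server and port 8081 are the defaults used in this workflow):

# upload_document: multipart POST to http://chain-server:8081/documents
#   files = {"file": (<file name>, <binary file object>, <MIME type>)}

# generate: JSON POST to http://chain-server:8081/generate (streamed response)
generate_payload = {
    "messages": [{"role": "user", "content": "your question here"}],
    "use_knowledge_base": True,   # False bypasses Milvus retrieval
    "max_tokens": 256
}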

Step-1: Load the PDF files from the dataset folder.

You can upload the PDF files containing the NVIDIA blogs to the document ingestion endpoint at http://chain-server:8081/documents:

%%capture
!unzip dataset.zip
import os
import requests
import mimetypes

def upload_document(file_path, url):
    # Upload a single file to the document ingestion endpoint.
    headers = {
        'accept': 'application/json'
    }
    mime_type, _ = mimetypes.guess_type(file_path)
    # Open the file with a context manager so the handle is always closed,
    # and send only the base name rather than the full local path.
    with open(file_path, 'rb') as f:
        files = {
            'file': (os.path.basename(file_path), f, mime_type)
        }
        response = requests.post(url, headers=headers, files=files)

    return response.text

def upload_pdf_files(folder_path, upload_url, num_files):
    # Upload at most num_files PDF files from folder_path.
    uploaded = 0
    for file_name in os.listdir(folder_path):
        _, ext = os.path.splitext(file_name)
        # Ingest only pdf files
        if ext.lower() == ".pdf":
            file_path = os.path.join(folder_path, file_name)
            print(upload_document(file_path, upload_url))
            uploaded += 1
            if uploaded >= num_files:
                break
import time

start_time = time.time()
NUM_DOCS_TO_UPLOAD = 100
upload_pdf_files("dataset", "http://chain-server:8081/documents", NUM_DOCS_TO_UPLOAD)
print(f"--- {time.time() - start_time} seconds ---")

Step-2: Ask a question without referring to the knowledge base

Ask the TensorRT-LLM Llama 2 13B model a question about “the NVIDIA Grace superchip” without seeking help from the vector database/knowledge base by setting use_knowledge_base to false.

import time
import json

data = {
    "messages": [
        {
            "role": "user",
            "content": "how many cores are on the nvidia grace superchip?"
        }
    ],
    # Use a real boolean, not the string "false", so the server skips retrieval.
    "use_knowledge_base": False,
    "max_tokens": 256
}

url = "http://chain-server:8081/generate"

start_time = time.time()
with requests.post(url, stream=True, json=data) as req:
    for chunk in req.iter_lines():
        raw_resp = chunk.decode("UTF-8")
        if not raw_resp:
            continue
        # Each event arrives as 'data: {...}'; strip the 6-character SSE prefix.
        resp_dict = json.loads(raw_resp[6:])
        resp_choices = resp_dict.get("choices", [])
        if len(resp_choices):
            resp_str = resp_choices[0].get("message", {}).get("content", "")
            print(resp_str, end="")

print(f"--- {time.time() - start_time} seconds ---")

Now ask the same question with use_knowledge_base set to true.

data = {
    "messages": [
        {
            "role": "user",
            "content": "how many cores are on the nvidia grace superchip?"
        }
    ],
    "use_knowledge_base": True,
    "max_tokens": 50
}

url = "http://chain-server:8081/generate"

start_time = time.time()
tokens_generated = 0
with requests.post(url, stream=True, json=data) as req:
    for chunk in req.iter_lines():
        raw_resp = chunk.decode("UTF-8")
        if not raw_resp:
            continue
        resp_dict = json.loads(raw_resp[6:])
        resp_choices = resp_dict.get("choices", [])
        if len(resp_choices):
            resp_str = resp_choices[0].get("message", {}).get("content", "")
            # Count each streamed delta as one generated token.
            tokens_generated += 1
            print(resp_str, end="")

total_time = time.time() - start_time
print(f"\n--- Generated {tokens_generated} tokens in {total_time:.2f} seconds ---")
print(f"--- {tokens_generated/total_time:.2f} tokens/sec ---")

Next steps

We have set up a playground UI where you can upload files and get answers. The UI is available at the same IP address as the notebooks: host_ip:8090/converse