NVIDIA AI Endpoints with LangChain
In this notebook, we use the mixtral_8x7b model as the LLM and the nvolveqa_40k embedding model, both provided by NVIDIA AI Endpoints, to build a simple RAG example with FAISS as the vectorstore.
Prerequisites
In order to successfully run this notebook, you will need the following:
Already successfully gone through the setup and generated an API key.
Installed the necessary Python dependencies from requirements.txt, then upgraded langchain-core with the command below:
pip install langchain-core==0.1.15
Note: change faiss-gpu -> faiss-cpu in prerequisite 2 if you do not have access to a GPU.
Step 1 - Export the NVIDIA_API_KEY
You can supply the NVIDIA_API_KEY directly in this notebook when you run the cell below
!pip install langchain-core==0.1.15
!pip install faiss-cpu # replace with faiss-gpu if you are using GPU
import getpass
import os
## API Key can be found by going to NVIDIA NGC -> AI Foundation Models -> (some model) -> Get API Code or similar.
## 10K free queries to any endpoint (which is a lot actually).
# del os.environ['NVIDIA_API_KEY'] ## delete key and reset
if os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
print("Valid NVIDIA_API_KEY already in environment. Delete to reset")
else:
nvapi_key = getpass.getpass("NVAPI Key (starts with nvapi-): ")
assert nvapi_key.startswith("nvapi-"), f"{nvapi_key[:5]}... is not a valid key"
os.environ["NVIDIA_API_KEY"] = nvapi_key
Step 2 - Initialize the LLM
Here we will use mixtral_8x7b
# Test run to confirm that you can generate a response successfully
from langchain_nvidia_ai_endpoints import ChatNVIDIA
llm = ChatNVIDIA(model="mixtral_8x7b", nvidia_api_key=nvapi_key)
result = llm.invoke("Write a ballad about LangChain.")
print(result.content)
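Optionally, the reply can be streamed instead of waiting for the full response. A minimal sketch, assuming ChatNVIDIA supports LangChain's standard streaming interface; the prompt is just an illustration.
# Optional: stream the reply chunk by chunk via the standard LangChain .stream() interface.
for chunk in llm.stream("Write a ballad about LangChain."):
    print(chunk.content, end="")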
Step 3 - Initialize the embedding model
We select nvolveqa_40k as the embedding model.
First, we initialize the embedding:
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
embedder = NVIDIAEmbeddings(model="nvolveqa_40k")
# Alternatively, if you want to specify whether it will use the query or passage type
# embedder = NVIDIAEmbeddings(model="nvolveqa_40k", model_type="passage")
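As a quick illustration of the two entry points: embed_query is intended for search queries and embed_documents for passages to be indexed. This is a minimal sketch with illustrative strings; both calls should report the same embedding dimension.
# Quick sanity check of the two embedding entry points (illustrative strings).
q_vec = embedder.embed_query("What is RAG?")
d_vecs = embedder.embed_documents(["Retrieval-augmented generation combines search with an LLM."])
print(len(q_vec), len(d_vecs[0]))  # embedding dimensions should match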
Step 4 - Obtain some toy text dataset
import os
from tqdm import tqdm
from pathlib import Path
# Here we read in the text data and prepare it for the vectorstore
ps = os.listdir("./toy_data/")
data = []
sources = []
for p in ps:
    if p.endswith('.txt'):
        path2file = "./toy_data/" + p
        with open(path2file, encoding="utf-8") as f:
            lines = f.readlines()
            for line in lines:
                if len(line) >= 1:
                    data.append(line)
                    sources.append(path2file)
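Alternatively, the same files could be loaded through LangChain's own document loaders instead of reading them by hand. A minimal sketch, assuming the toy_data layout above; the rest of the notebook keeps using the manually built data and sources lists.
# Alternative loading path via LangChain's TextLoader (not used in the rest of this notebook).
from pathlib import Path
from langchain.document_loaders import TextLoader
loaded_docs = []
for txt_path in Path("./toy_data/").glob("*.txt"):
    loaded_docs.extend(TextLoader(str(txt_path), encoding="utf-8").load())
print(len(loaded_docs), "documents loaded")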
Step 5 - Do some basic cleaning and remove empty lines
documents = [d for d in data if d != '\n']
len(data), len(documents), data[0]
Step 6a (optional) - Speed test: compare how long (in seconds) it takes to embed 1 document vs. a batch of 10 documents
import time
print("Single Document Embedding: ")
s = time.perf_counter()
q_embedding = embedder.embed_documents([documents[0]])
elapsed = time.perf_counter() - s
print("\033[1m" + f"Executed in {elapsed:0.2f} seconds." + "\033[0m")
print("Shape:", (len(q_embedding),))
print("\nBatch Document Embedding: ")
s = time.perf_counter()
d_embeddings = embedder.embed_documents(documents[:10])
elapsed = time.perf_counter() - s
print("\033[1m" + f"Executed in {elapsed:0.2f} seconds." + "\033[0m")
print("Shape:",len(d_embeddings[0]))
Step 6b - Process the documents into faiss vectorstore and save it to disk
# Here we create a vector store from the documents and save it to disk.
import faiss
from operator import itemgetter
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.text_splitter import CharacterTextSplitter
from langchain_nvidia_ai_endpoints import ChatNVIDIA
import pickle

text_splitter = CharacterTextSplitter(chunk_size=400, separator=" ")
docs = []
metadatas = []
for i, d in enumerate(documents):
    splits = text_splitter.split_text(d)
    # print(len(splits))
    docs.extend(splits)
    metadatas.extend([{"source": sources[i]}] * len(splits))
store = FAISS.from_texts(docs, embedder, metadatas=metadatas)
faiss.write_index(store.index, "./toy_data/nv_embedding.index")
store.index = None
with open("./toy_data/nv_embedding.pkl", "wb") as f:
    pickle.dump(store, f)
# You only need to do this once; later on we will restore the already saved vectorstore.
Step 6c - Read the previously processed & saved FAISS vector store back
# Load the vectorstore back.
import faiss
import pickle
index = faiss.read_index("./toy_data/nv_embedding.index")
with open("./toy_data/nv_embedding.pkl", "rb") as f:
store = pickle.load(f)
store.index = index
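Before wiring the restored store into a chain, it is worth a quick sanity check that it returns sensible neighbors. A minimal sketch; the query string is illustrative and assumes the toy data contains text about Sweden.
# Quick sanity check of the restored vectorstore (query string is illustrative).
hits = store.similarity_search("Tell me about Sweden.", k=2)
for doc in hits:
    print(doc.metadata["source"], "->", doc.page_content[:80])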
Step 7 - Wrap the restored vectorstore into a retriever and ask our question
retriever = store.as_retriever()
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer solely based on the following context:\n<Documents>\n{context}\n</Documents>",
        ),
        ("user", "{question}"),
    ]
)
model = ChatNVIDIA(model="mixtral_8x7b")
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)
chain.invoke("Tell me about Sweden.")