Building a RAG Application
Learn how to build a Retrieval-Augmented Generation (RAG) application using BrilliantAI's Embeddings and Chat APIs.
Complete RAG Example
from openai import OpenAI
import numpy as np
from typing import List, Dict
from dataclasses import dataclass


@dataclass
class Document:
    content: str
    metadata: Dict


class RAGSystem:
    def __init__(self, api_key: str):
        self.client = OpenAI(
            base_url="https://api.brilliantai.co",
            api_key=api_key
        )
        self.documents: List[Document] = []
        self.embeddings: List[List[float]] = []

    def add_document(self, content: str, metadata: Dict = None):
        """Add a document to the RAG system"""
        # Get embedding for document
        response = self.client.embeddings.create(
            model="snowflake-arctic-embed-l-v2.0",
            input=content
        )
        embedding = response.data[0].embedding

        # Store document and embedding
        self.documents.append(Document(content, metadata or {}))
        self.embeddings.append(embedding)

    def similarity(self, a: List[float], b: List[float]) -> float:
        """Calculate cosine similarity between vectors"""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def get_relevant_documents(self, query: str, k: int = 3) -> List[Document]:
        """Retrieve k most relevant documents for query"""
        # Get query embedding
        response = self.client.embeddings.create(
            model="snowflake-arctic-embed-l-v2.0",
            input=query
        )
        query_embedding = response.data[0].embedding

        # Calculate similarities
        similarities = [
            self.similarity(query_embedding, doc_embedding)
            for doc_embedding in self.embeddings
        ]

        # Get top k documents
        top_indices = np.argsort(similarities)[-k:][::-1]
        return [self.documents[i] for i in top_indices]

    def query(self, question: str) -> str:
        """Query the RAG system"""
        # Get relevant documents
        relevant_docs = self.get_relevant_documents(question)

        # Construct prompt with context
        context = "\n\n".join(doc.content for doc in relevant_docs)
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant. Answer the question based on the provided context."
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {question}"
            }
        ]

        # Get response from LLM
        response = self.client.chat.completions.create(
            model="mistral-nemo",
            messages=messages,
            temperature=0.7
        )
        return response.choices[0].message.content


# Usage Example
rag = RAGSystem("your-api-key")

# Add some documents
rag.add_document(
    "BrilliantAI provides access to state-of-the-art AI models through a simple API.",
    {"source": "about.txt"}
)
rag.add_document(
    "Our flagship model mistral-nemo excels at general-purpose text generation.",
    {"source": "models.txt"}
)
rag.add_document(
    "Embeddings are used for semantic search and RAG applications.",
    {"source": "embeddings.txt"}
)

# Query the system
question = "What models does BrilliantAI offer?"
answer = rag.query(question)
print(answer)
FastAPI Web Service
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional

app = FastAPI()

# Reuses the RAGSystem class from the complete example above
rag = RAGSystem("your-api-key")


class Document(BaseModel):
    content: str
    metadata: Optional[dict] = None


class Query(BaseModel):
    question: str


@app.post("/documents")
async def add_document(document: Document):
    try:
        rag.add_document(document.content, document.metadata)
        return {"status": "success"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/query")
async def query(query: Query):
    try:
        answer = rag.query(query.question)
        return {"answer": answer}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
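With the service running (for example via uvicorn), you can exercise both endpoints from any HTTP client. The snippet below is a minimal client sketch; the localhost address, port, and payload contents are assumptions for illustration.

import requests

BASE_URL = "http://localhost:8000"  # assumed local development address

# Add a document
resp = requests.post(
    f"{BASE_URL}/documents",
    json={
        "content": "BrilliantAI exposes OpenAI-compatible Embeddings and Chat APIs.",
        "metadata": {"source": "docs"}
    }
)
print(resp.json())

# Ask a question
resp = requests.post(
    f"{BASE_URL}/query",
    json={"question": "What APIs does BrilliantAI expose?"}
)
print(resp.json()["answer"])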
Vector Database Integration
Using Pinecone
from openai import OpenAI
from typing import Dict
from uuid import uuid4

import pinecone


class PineconeRAG:
    def __init__(self, api_key: str, pinecone_key: str, index_name: str):
        self.client = OpenAI(
            base_url="https://api.brilliantai.co",
            api_key=api_key
        )
        # Initialize Pinecone (legacy pinecone-client init; SDK v3+ uses the Pinecone(api_key=...) class instead)
        pinecone.init(api_key=pinecone_key)
        self.index = pinecone.Index(index_name)

    def add_document(self, content: str, metadata: Dict = None):
        """Add document to Pinecone"""
        # Get embedding
        response = self.client.embeddings.create(
            model="snowflake-arctic-embed-l-v2.0",
            input=content
        )
        vector = response.data[0].embedding

        # Store in Pinecone
        self.index.upsert([
            (str(uuid4()), vector, {
                "content": content,
                **(metadata or {})
            })
        ])

    def query(self, question: str) -> str:
        """Query using Pinecone"""
        # Get question embedding
        response = self.client.embeddings.create(
            model="snowflake-arctic-embed-l-v2.0",
            input=question
        )
        vector = response.data[0].embedding

        # Query Pinecone
        results = self.index.query(
            vector=vector,
            top_k=3,
            include_metadata=True
        )

        # Get relevant contexts
        contexts = [
            match.metadata["content"]
            for match in results.matches
        ]

        # Query LLM
        context = "\n\n".join(contexts)
        messages = [
            {
                "role": "system",
                "content": "Answer based on the context provided."
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {question}"
            }
        ]
        response = self.client.chat.completions.create(
            model="mistral-nemo",
            messages=messages
        )
        return response.choices[0].message.content
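A short usage sketch follows, assuming you have already created a Pinecone index whose dimension matches the embedding model; the key values and index name are placeholders.

# Placeholder keys and index name; replace with your own values
pinecone_rag = PineconeRAG(
    api_key="your-brilliantai-key",
    pinecone_key="your-pinecone-key",
    index_name="rag-docs"
)

pinecone_rag.add_document(
    "BrilliantAI provides access to state-of-the-art AI models through a simple API.",
    {"source": "about.txt"}
)

print(pinecone_rag.query("What does BrilliantAI provide?"))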
Best Practices
Document Processing
- Split long documents into chunks (see the chunking sketch below)
- Maintain context across chunk boundaries, for example with overlap
- Remove irrelevant content
- Clean and normalize text
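For chunking, a minimal character-based sketch is shown below; the chunk size and overlap values are illustrative defaults, not tuned recommendations.

from typing import List

def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split text into overlapping character-based chunks."""
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            break
        # Step forward while keeping some overlap so context carries across chunks
        start = end - overlap
    return chunks

# Each chunk can then be passed to rag.add_document(...)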
Embedding Strategy
- Choose an appropriate chunk size
- Use semantic similarity for retrieval
- Consider metadata filtering
- Cache common embeddings (see the caching sketch below)
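For caching, a simple in-memory sketch is shown below, assuming exact-match reuse of embeddings is acceptable; the helper name and hashing choice are illustrative.

import hashlib
from typing import Dict, List

_embedding_cache: Dict[str, List[float]] = {}

def get_embedding_cached(client, text: str) -> List[float]:
    """Return a cached embedding when the same text has been embedded before."""
    key = hashlib.sha256(text.encode("utf-8")).hexdigest()
    if key not in _embedding_cache:
        response = client.embeddings.create(
            model="snowflake-arctic-embed-l-v2.0",
            input=text
        )
        _embedding_cache[key] = response.data[0].embedding
    return _embedding_cache[key]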
Query Processing
- Implement proper error handling
- Use an appropriate temperature for your task
- Monitor token usage
- Consider response streaming (see the streaming sketch below)
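For streaming, chat completions can be returned token by token; a minimal sketch, assuming the endpoint supports the OpenAI-style stream=True option and using the same client as in the examples above:

def stream_answer(client, messages) -> str:
    """Stream a chat completion and print tokens as they arrive."""
    stream = client.chat.completions.create(
        model="mistral-nemo",
        messages=messages,
        temperature=0.7,
        stream=True
    )
    answer = ""
    for chunk in stream:
        # Each chunk carries an incremental delta; content may be None on some chunks
        delta = chunk.choices[0].delta.content or ""
        print(delta, end="", flush=True)
        answer += delta
    print()
    return answer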
Next Steps
- Learn more about the Embeddings API
- Try building chat applications
- Explore the available LLMs