import os
from dotenv import load_dotenv
load_dotenv()  # actually call it so the .env values are loaded
USE_GROQ = True # Set to True to send completions to Groq API
MODEL_ID_LOCAL = "TheBloke/Llama-3-8B-Instruct-GGUF" # ~4 GB Q4_K_M
MODEL_FILE = "llama-3-8b-instruct.Q4_K_M.gguf" # filename inside repo
if USE_GROQ:
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")Common notebook that I created to learn advanced rag concepts
LangSmith tracing to me each step from the LLM
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "advanced-rag-notebook"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")New documentation about langchain, to the LLM learn from it
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
urls = [
"https://python.langchain.com/docs/concepts/lcel/",
"https://python.langchain.com/docs/tutorials/rag/"
]
loader = WebBaseLoader(urls)
documents = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
docs_split = splitter.split_documents(documents)
print(f"Loaded {len(documents)} pages → {len(docs_split)} chunks")
Loaded 2 pages → 90 chunks
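A quick peek at one chunk helps confirm the splitter settings look sane before embedding everything. A tiny sketch I added on top of the objects above; nothing here changes the pipeline.
# Hypothetical sanity check: inspect the first chunk and its metadata
sample = docs_split[0]
print(len(sample.page_content), "characters in the first chunk, from", sample.metadata.get("source"))
print(sample.page_content[:200])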
Load a lightweight embedding model that runs locally, and save the content into a vector DB.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma.from_documents(
docs_split, embeddings, collection_name="rag_demo"
)
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 8})
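Before layering on compression, a quick similarity search shows what the raw vector store returns for a sample question. This is a sketch I added; the query text is just an example and Chroma's scores here are distances (lower means closer).
# Quick sanity check of the raw vector store (illustrative query, not from the original run)
hits = vectordb.similarity_search_with_score("What is the LangChain Expression Language?", k=3)
for doc, score in hits:
    print(f"{score:.3f}  {doc.metadata.get('source')}  {doc.page_content[:80]!r}")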
Retriever configuration based on an embeddings filter.
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
compression_retriever = ContextualCompressionRetriever(
base_compressor=EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.5),
base_retriever=retriever,
)
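To see what the embeddings filter actually removes, it helps to compare the base retriever with the compression retriever on the same question. A small sketch I added for illustration; the question string is arbitrary and get_relevant_documents is the classic retriever call.
# Compare chunk counts before and after the EmbeddingsFilter (illustrative)
question = "How does LCEL compose chains?"
raw_docs = retriever.get_relevant_documents(question)
filtered_docs = compression_retriever.get_relevant_documents(question)
print(f"{len(raw_docs)} chunks from the base retriever → {len(filtered_docs)} after the similarity filter")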
Here I'm using the Groq API, which has a nice free trial. If Groq isn't available, I fall back to a small, low-parameter Llama model.
from langchain.chat_models import ChatOpenAI
from langchain.llms import LlamaCpp
from pathlib import Path
if USE_GROQ:
    llm = ChatOpenAI(
        api_key=os.environ["GROQ_API_KEY"],
        base_url="https://api.groq.com/openai/v1",
        model_name="llama3-8b-8192",  # or mixtral-8x7b-32768
        temperature=0.2,
        max_tokens=1024,
    )
else:
    model_path = Path.home() / ".cache" / MODEL_FILE
    if not model_path.exists():
        from huggingface_hub import hf_hub_download
        model_path = hf_hub_download(repo_id=MODEL_ID_LOCAL, filename=MODEL_FILE)
    llm = LlamaCpp(
        model_path=str(model_path),
        n_gpu_layers=0,  # CPU-only; set >0 for GPU offload
        n_ctx=8192,
        temperature=0.2,
        max_tokens=1024,
    )
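A one-line smoke test confirms whichever backend got configured actually responds before it is wired into the chain. This is a sketch I added, assuming an LCEL-era LangChain where both models expose invoke; the chat model returns an AIMessage while LlamaCpp returns a plain string.
# Smoke test for the configured LLM (illustrative; not part of the original notebook)
response = llm.invoke("Reply with the single word: ready")
print(getattr(response, "content", response))  # AIMessage.content for Groq, raw string for LlamaCpp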
Creating the RAG Pipeline:
- user question
- retrieve the relevant docs
- the LLM answers the question using the retrieved docs
from langchain.chains import RetrievalQA
rag_pipeline = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=compression_retriever,
return_source_documents=True,
verbose=True,
)
query = "How does LangChain let you compose chains using the Expression Language?"
result = rag_pipeline(query)
print("💬 Answer:\n", result["result"])
print("\n🔗 Sources:")
for doc in result["source_documents"]:
print(" -", doc.metadata["source"])> Entering new RetrievalQA chain... > Finished chain. 💬 Answer: I don't know. 🔗 Sources: - https://python.langchain.com/docs/concepts/lcel/ - https://python.langchain.com/docs/concepts/lcel/ - https://python.langchain.com/docs/concepts/lcel/ - https://python.langchain.com/docs/concepts/lcel/ - https://python.langchain.com/docs/concepts/lcel/ - https://python.langchain.com/docs/concepts/lcel/ - https://python.langchain.com/docs/concepts/lcel/ - https://python.langchain.com/docs/concepts/lcel/