Training a Local LLM for Document Retrieval
Build a complete offline RAG system with Python, llama.cpp, embeddings, and FAISS vector search — no cloud dependency required.
Training a Local LLM for Document Retrieval
I wanted my local LLM to understand my personal documents without cloud API calls. This post covers the complete pipeline: document ingestion, embedding, vector indexing with FAISS, and inference with llama.cpp running entirely offline.
The Problem: Cloud Dependency and Cost
Most RAG setups require external API calls to proprietary embedding services and LLMs. This means monthly API costs, data leaving your machine, and dependency on third-party availability. A local pipeline solves all three issues while giving you full control over your data.
Architecture and Tools
The system has four layers:
[Document Input] → [Chunking & Embedding] → [Vector Index (FAISS)]
        ↓
[Query Embedding]
        ↓
[Retrieval + LLM Generation]
Required packages:
| Component | Tool | Purpose |
|---|---|---|
| LLM Runtime | llama-cpp-python | Run quantized GGUF models |
| Embeddings | sentence-transformers | Convert text to vectors |
| Vector Store | faiss | In-memory/disk-backed indexing |
| Processing | markdown, pdfminer, python-docx | Load various document formats |
Setup
python -m venv .venv
source .venv/bin/activate
pip install llama-cpp-python==0.2.78
pip install sentence-transformers==3.0.1
pip install faiss-cpu==1.8.0
pip install markdown pdfminer.six python-docx
Download a quantized GGUF model (e.g., Mistral-7B-Instruct.Q4_K_M.gguf or Llama-3.1-8B-Instruct.Q4_K_M.gguf) and place it in ./models/.
Implementation
Step 1: Document Ingestion and Chunking
Load documents consistently across Markdown, PDF, and DOCX formats:
# ingest.py
import os
import re
from pathlib import Path
from typing import List, Dict
from pdfminer.high_level import extract_text as pdf_extract
from docx import Document as DocxDocument
def read_markdown(path: Path) -> str:
    """Return the file's text with HTML comments (<!-- ... -->) stripped out."""
    raw = path.read_text(encoding="utf-8")
    # DOTALL so multi-line comments are removed in full.
    return re.sub(r'<!--.*?-->', '', raw, flags=re.S)
def read_pdf(path: Path) -> str:
    """Return all extractable text from the PDF at *path* (via pdfminer)."""
    extracted = pdf_extract(str(path))
    return extracted
def read_docx(path: Path) -> str:
    """Return the document's paragraph text, one paragraph per line."""
    paragraphs = DocxDocument(str(path)).paragraphs
    return "\n".join(para.text for para in paragraphs)
def load_corpus(root: Path) -> List[Dict]:
    """Recursively load every supported document under *root*.

    Returns a list of ``{"text": ..., "source": ...}`` dicts, one per file.
    Files with unsupported extensions are skipped silently.
    """
    # Extension -> reader dispatch replaces the repetitive if/elif chain.
    readers = {
        ".md": read_markdown,
        ".mdx": read_markdown,
        ".pdf": read_pdf,
        ".docx": read_docx,
    }
    docs: List[Dict] = []
    for p in root.rglob("*"):
        # rglob also yields directories; a directory named e.g. "x.md"
        # would otherwise be handed to a reader and crash.
        if not p.is_file():
            continue
        reader = readers.get(p.suffix.lower())
        if reader is None:
            continue
        docs.append({"text": reader(p), "source": str(p)})
    return docs
def chunk_text(text: str, chunk_size=800, chunk_overlap=120) -> List[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Each chunk holds up to *chunk_size* words; consecutive chunks share
    *chunk_overlap* words so context is not lost at boundaries.

    Raises:
        ValueError: if chunk_overlap >= chunk_size (the original while-loop
            would never advance and spin forever).
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - chunk_overlap
    # range() with a positive step replaces the manual while/i += step loop.
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def build_documents(root: str) -> List[Dict]:
    """Load every document under *root* and return its chunked pieces.

    Each returned dict pairs a chunk of text with the path it came from.
    """
    chunked: List[Dict] = []
    for doc in load_corpus(Path(root)):
        chunked.extend(
            {"text": piece, "source": doc["source"]}
            for piece in chunk_text(doc["text"])
        )
    return chunked
if __name__ == "__main__":
    # Smoke test: ingest the sample corpus and report how many chunks we got.
    docs = build_documents("./my_docs")
    print(f"Loaded {len(docs)} chunks")
The defaults use 800 whitespace-delimited words per chunk with a 120-word overlap (note: the chunker splits on words, not model tokens), which works well for 7–8B models. Adjust based on your content density.
Step 2: Build Embeddings and Vector Index
Compute embeddings locally and store them in FAISS with metadata:
# index.py
import json
from pathlib import Path
from typing import List, Dict
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
# Embedding model used for both indexing and querying — the two must match,
# otherwise search scores are meaningless.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
INDEX_DIR = Path("./vector_index")
# NOTE: side effect at import time — the index directory is created as soon
# as this module is imported.
INDEX_DIR.mkdir(exist_ok=True)
def save_metadata(metadata: List[Dict], path: Path):
    """Serialize chunk metadata to pretty-printed UTF-8 JSON at *path*."""
    payload = json.dumps(metadata, ensure_ascii=False, indent=2)
    path.write_text(payload, encoding="utf-8")
def load_metadata(path: Path) -> List[Dict]:
    """Read chunk metadata previously written by save_metadata."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
def build_index(docs: List[Dict]):
    """Embed every chunk and persist a FAISS inner-product index plus metadata.

    Writes ``faiss.index`` and ``meta.json`` into INDEX_DIR; the metadata
    list is parallel to the index rows (row i describes vector i).
    """
    encoder = SentenceTransformer(EMBEDDING_MODEL)
    vectors = encoder.encode(
        [d["text"] for d in docs],
        batch_size=64,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    vectors = np.asarray(vectors, dtype="float32")
    # With normalized vectors, inner product equals cosine similarity.
    ip_index = faiss.IndexFlatIP(vectors.shape[1])
    ip_index.add(vectors)
    faiss.write_index(ip_index, str(INDEX_DIR / "faiss.index"))
    save_metadata(docs, INDEX_DIR / "meta.json")
    print(f"Indexed {len(docs)} chunks")
def load_index():
    """Load the persisted FAISS index and its parallel metadata list."""
    faiss_index = faiss.read_index(str(INDEX_DIR / "faiss.index"))
    chunk_meta = load_metadata(INDEX_DIR / "meta.json")
    return faiss_index, chunk_meta
def search(query: str, k: int = 5):
    """Return up to *k* chunks most similar to *query*, best first.

    Each result dict carries the cosine-similarity ``score`` plus the
    chunk's metadata (``text``, ``source``).

    Note: the index and embedding model are reloaded on every call; cache
    them at a higher level if query latency matters.
    """
    index, meta = load_index()
    model = SentenceTransformer(EMBEDDING_MODEL)
    q = model.encode([query], normalize_embeddings=True)
    scores, ids = index.search(np.asarray(q, dtype="float32"), k)
    # FAISS pads the id array with -1 when the index holds fewer than k
    # vectors; meta[-1] would silently return the LAST chunk for those
    # slots, so drop them instead.
    return [
        {"score": float(score), **meta[idx]}
        for score, idx in zip(scores[0], ids[0])
        if idx >= 0
    ]
if __name__ == "__main__":
    # Build (or rebuild) the index from ./my_docs, then run a sample query.
    from ingest import build_documents
    docs = build_documents("./my_docs")
    build_index(docs)
    print(search("deployment strategies for Astro + Supabase"))
Step 3: LLM Inference with Context
Load a quantized model and generate responses grounded in retrieved documents:
# llm.py
from llama_cpp import Llama
# Path to the quantized GGUF model weights (see setup section).
MODEL_PATH = "./models/Mistral-7B-Instruct.Q4_K_M.gguf"
# Loaded once at import time; the model stays resident for the process lifetime.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,   # context window shared by prompt + completion
    n_batch=128,
    temperature=0.2,
    repeat_penalty=1.1,
)
# NOTE(review): temperature/repeat_penalty are normally sampling-time options
# passed to the completion call, not constructor options — confirm this
# llama-cpp-python version actually honors them here rather than ignoring them.
SYSTEM_PROMPT = (
    "You are a helpful assistant. Use only the provided context. "
    "If the answer is not in the context, say you don't know."
)

def build_prompt(question: str, contexts: list[str]) -> str:
    """Assemble a Llama/Mistral ``[INST]`` prompt embedding numbered sources.

    Each retrieved context is labeled ``[SOURCE] n`` so the model can cite it.
    """
    numbered = [
        f"[SOURCE] {n}\n{snippet}"
        for n, snippet in enumerate(contexts, start=1)
    ]
    parts = [
        f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n",
        "\n\n[CONTEXT]\n" + "\n\n".join(numbered),
        f"\n\n[QUESTION]\n{question} [/INST]",
    ]
    return "".join(parts)
def generate(question: str, contexts: list[str]) -> str:
    """Run the local model over the retrieved contexts and return its answer."""
    completion = llm(
        build_prompt(question, contexts),
        max_tokens=768,
        stop=["</s>", "[/INST]"],
    )
    answer_text = completion["choices"][0]["text"]
    return answer_text.strip()
Step 4: Wire RAG End-to-End
Connect retrieval and generation:
# app.py
from index import search
from llm import generate
def answer(question: str, k: int = 5) -> dict:
    """Retrieve the top-*k* chunks for *question* and generate a grounded answer.

    Returns the generated text, the cited sources with scores, and a short
    preview of each context chunk used.
    """
    hits = search(question, k=k)
    snippets = [hit["text"] for hit in hits]
    return {
        "answer": generate(question, snippets),
        "sources": [
            {"source": hit["source"], "score": round(hit["score"], 4)}
            for hit in hits
        ],
        "context_preview": [s[:300] for s in snippets],
    }
if __name__ == "__main__":
    # Minimal REPL: type a question, get an answer plus cited sources.
    # Type "exit"/"quit" (or Ctrl-C / Ctrl-D) to leave.
    while True:
        try:
            q = input("\nQ: ").strip()
            if not q:
                continue
            if q.lower() in {"exit", "quit"}:
                break
            result = answer(q, k=5)
            print("\n--- Answer ---\n")
            print(result["answer"])
            print("\n--- Sources ---\n")
            for s in result["sources"]:
                print(f"- {s['source']} (score={s['score']})")
        except (KeyboardInterrupt, EOFError):
            # EOFError was previously unhandled: Ctrl-D (or exhausted piped
            # stdin) crashed the loop with a traceback instead of exiting.
            break
Key Takeaways
- Modular pipeline: Separation of ingestion, embedding, retrieval, and generation makes the system testable and swappable at each layer.
- Quantization works: GGUF Q4 models deliver good quality with minimal compute requirements. Start there before trying larger models.
- Grounded answers: Anchoring LLM output in retrieved documents with explicit source tracking substantially reduces hallucination (though it cannot eliminate it entirely).
- Tuning knobs: Chunk size, overlap, embedding model, and system prompt all affect response quality. Iterate based on your content type.
- GPU acceleration optional: CPU-only FAISS search and llama.cpp run well on modern hardware. GPU acceleration helps with embedding computation, not required.