Training a Local LLM for Document Retrieval

Training a Local LLM for Document Retrieval

Build a complete offline RAG system with Python, llama.cpp, embeddings, and FAISS vector searchβ€”no cloud dependency required.

Training a Local LLM for Document Retrieval

I wanted my local LLM to understand my personal documents without cloud API calls. This post covers the complete pipeline: document ingestion, embedding, vector indexing with FAISS, and inference with llama.cpp running entirely offline.

πŸ” The Problem: Cloud Dependency and Cost

Most RAG setups require external API calls to proprietary embedding services and LLMs. This means monthly API costs, data leaving your machine, and dependency on third-party availability. A local pipeline solves all three issues while giving you full control over your data.

πŸ› οΈ Architecture and Tools

The system has four layers:

[Document Input] β†’ [Chunking & Embedding] β†’ [Vector Index (FAISS)]
                                                      ↓
                                              [Query Embedding]
                                                      ↓
                                        [Retrieval + LLM Generation]

Required packages:

| Component    | Tool                            | Purpose                        |
| ------------ | ------------------------------- | ------------------------------ |
| LLM Runtime  | llama-cpp-python                | Run quantized GGUF models      |
| Embeddings   | sentence-transformers           | Convert text to vectors        |
| Vector Store | faiss                           | In-memory/disk-backed indexing |
| Processing   | markdown, pdfminer, python-docx | Load various document formats  |

Setup

python -m venv .venv
source .venv/bin/activate

pip install llama-cpp-python==0.2.78
pip install sentence-transformers==3.0.1
pip install faiss-cpu==1.8.0
pip install markdown pdfminer.six python-docx

Download a quantized GGUF model (e.g., Mistral-7B-Instruct.Q4_K_M.gguf or Llama-3.1-8B-Instruct.Q4_K_M.gguf) and place it in ./models/.

πŸ“Š Implementation

Step 1: Document Ingestion and Chunking

Load documents consistently across Markdown, PDF, and DOCX formats:

# ingest.py
import os
import re
from pathlib import Path
from typing import List, Dict

from pdfminer.high_level import extract_text as pdf_extract
from docx import Document as DocxDocument

def read_markdown(path: Path) -> str:
    """Return the Markdown file's text with HTML comments stripped out."""
    raw = path.read_text(encoding="utf-8")
    # re.S lets '.' span newlines so multi-line <!-- ... --> comments go too.
    return re.sub(r'<!--.*?-->', '', raw, flags=re.S)

def read_pdf(path: Path) -> str:
    """Extract the plain-text content of a PDF via pdfminer."""
    pdf_file = str(path)
    return pdf_extract(pdf_file)

def read_docx(path: Path) -> str:
    """Return the text of a .docx file, one paragraph per line."""
    document = DocxDocument(str(path))
    lines = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(lines)

def load_corpus(root: Path) -> List[Dict]:
    """Recursively load every supported document under *root*.

    Supported formats: Markdown (.md/.mdx), PDF (.pdf), Word (.docx).
    Files with any other suffix are silently skipped.

    Args:
        root: Directory to walk recursively.

    Returns:
        One dict per file: {"text": <full document text>, "source": <path>}.
    """
    # Dispatch on lowercase suffix instead of a repeated if/elif chain —
    # adding a new format is now a single table entry.
    readers = {
        ".md": read_markdown,
        ".mdx": read_markdown,
        ".pdf": read_pdf,
        ".docx": read_docx,
    }
    docs = []
    for p in root.rglob("*"):
        reader = readers.get(p.suffix.lower())
        if reader is not None:
            docs.append({"text": reader(p), "source": str(p)})
    return docs

def chunk_text(text: str, chunk_size=800, chunk_overlap=120) -> List[str]:
    """Split *text* into whitespace-tokenized chunks of ``chunk_size`` words.

    Consecutive chunks share ``chunk_overlap`` words so sentences that span
    a chunk boundary appear in both neighbors.

    Args:
        text: Raw document text.
        chunk_size: Maximum words per chunk.
        chunk_overlap: Words repeated between adjacent chunks.

    Returns:
        List of chunk strings; empty for empty/whitespace-only input.

    Raises:
        ValueError: If ``chunk_overlap >= chunk_size`` — the window would
            never advance and the original loop would spin forever.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - chunk_overlap
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]

def build_documents(root: str) -> List[Dict]:
    """Load the corpus under *root* and split each document into chunks.

    Returns one dict per chunk, each carrying the originating file path so
    retrieval results can be traced back to their source.
    """
    chunked = []
    for doc in load_corpus(Path(root)):
        chunked.extend(
            {"text": piece, "source": doc["source"]}
            for piece in chunk_text(doc["text"])
        )
    return chunked

if __name__ == "__main__":
    docs = build_documents("./my_docs")
    print(f"Loaded {len(docs)} chunks")

Use roughly 800 words per chunk with a 120-word overlap (the chunker splits on whitespace, so these are word counts, not exact tokens) — a good fit for 7–8B models. Adjust based on your content density.

Step 2: Build Embeddings and Vector Index

Compute embeddings locally and store them in FAISS with metadata:

# index.py
import json
from pathlib import Path
from typing import List, Dict

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
INDEX_DIR = Path("./vector_index")
INDEX_DIR.mkdir(exist_ok=True)

def save_metadata(metadata: List[Dict], path: Path):
    """Persist chunk metadata to *path* as pretty-printed UTF-8 JSON."""
    payload = json.dumps(metadata, ensure_ascii=False, indent=2)
    path.write_text(payload, encoding="utf-8")

def load_metadata(path: Path) -> List[Dict]:
    """Read chunk metadata previously written by save_metadata."""
    with path.open(encoding="utf-8") as fh:
        return json.load(fh)

def build_index(docs: List[Dict]):
    """Embed every chunk in *docs* and persist a FAISS index plus metadata.

    Writes two artifacts under INDEX_DIR: "faiss.index" (the vectors) and
    "meta.json" (per-chunk text + source, aligned with the vector ids).
    """
    encoder = SentenceTransformer(EMBEDDING_MODEL)
    chunk_texts = [doc["text"] for doc in docs]
    # normalize_embeddings=True makes inner product equal cosine similarity,
    # which is why an IndexFlatIP is used below.
    vectors = encoder.encode(
        chunk_texts, batch_size=64, show_progress_bar=True,
        normalize_embeddings=True,
    )
    vectors = np.asarray(vectors, dtype="float32")

    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)

    faiss.write_index(index, str(INDEX_DIR / "faiss.index"))
    save_metadata(docs, INDEX_DIR / "meta.json")
    print(f"Indexed {len(docs)} chunks")

def load_index():
    """Load the persisted FAISS index together with its chunk metadata."""
    meta = load_metadata(INDEX_DIR / "meta.json")
    faiss_index = faiss.read_index(str(INDEX_DIR / "faiss.index"))
    return faiss_index, meta

def search(query: str, k: int = 5):
    """Return the *k* chunks most similar to *query*, best match first.

    The FAISS index and the embedding model are loaded lazily on the first
    call and cached on the function object — the original reloaded both on
    every query, paying the full model-load cost each time.  Note: a rebuilt
    on-disk index is therefore not picked up until the process restarts.

    Returns:
        List of dicts: the chunk's metadata plus a cosine-similarity "score".
    """
    cache = getattr(search, "_cache", None)
    if cache is None:
        cache = {
            "index_meta": load_index(),
            "model": SentenceTransformer(EMBEDDING_MODEL),
        }
        search._cache = cache
    index, meta = cache["index_meta"]
    model = cache["model"]

    q = model.encode([query], normalize_embeddings=True)
    scores, ids = index.search(np.array(q, dtype="float32"), k)
    # FAISS pads with id -1 when k exceeds the index size; meta[-1] would
    # silently return the last chunk, so skip those slots.
    return [
        {"score": float(score), **meta[int(idx)]}
        for score, idx in zip(scores[0], ids[0])
        if idx != -1
    ]

if __name__ == "__main__":
    from ingest import build_documents
    docs = build_documents("./my_docs")
    build_index(docs)
    print(search("deployment strategies for Astro + Supabase"))

Step 3: LLM Inference with Context

Load a quantized model and generate responses grounded in retrieved documents:

# llm.py
from llama_cpp import Llama

MODEL_PATH = "./models/Mistral-7B-Instruct.Q4_K_M.gguf"

# NOTE(review): sampling parameters such as temperature and repeat_penalty
# are call-time arguments in llama-cpp-python (passed to llm(...)), not
# Llama() constructor arguments — passing them here lands in **kwargs and is
# silently ignored.  Set them on each generation call instead.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,    # context window: prompt + generated tokens
    n_batch=128,   # prompt-processing batch size
)

# Instruction prepended to every prompt: restricts the model to the retrieved
# context and tells it to admit when the context lacks the answer.
SYSTEM_PROMPT = (
    "You are a helpful assistant. Use only the provided context. "
    "If the answer is not in the context, say you don't know."
)

def build_prompt(question: str, contexts: list[str]) -> str:
    """Assemble a Llama-2/Mistral-style [INST] prompt from the question and
    the retrieved context chunks, each labeled with a 1-based [SOURCE] tag."""
    numbered = [f"[SOURCE] {n + 1}\n{chunk}" for n, chunk in enumerate(contexts)]
    joined = "\n\n".join(numbered)
    return (
        f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n"
        f"\n\n[CONTEXT]\n{joined}\n\n[QUESTION]\n{question} [/INST]"
    )

def generate(question: str, contexts: list[str]) -> str:
    """Generate an answer to *question* grounded in the retrieved *contexts*.

    Sampling parameters are supplied per call because llama-cpp-python
    applies temperature / repeat_penalty at generation time, not at model
    load time (constructor kwargs are silently ignored).

    Returns:
        The model's completion text, stripped of surrounding whitespace.
    """
    prompt = build_prompt(question, contexts)
    out = llm(
        prompt,
        max_tokens=768,
        temperature=0.2,      # low temperature → near-deterministic answers
        repeat_penalty=1.1,   # discourage verbatim repetition loops
        stop=["</s>", "[/INST]"],
    )
    return out["choices"][0]["text"].strip()

Step 4: Wire RAG End-to-End

Connect retrieval and generation:

# app.py
from index import search
from llm import generate

def answer(question: str, k: int = 5) -> dict:
    """Run the full RAG loop: retrieve the top-k chunks, then generate.

    Returns a dict with the generated "answer", the ranked "sources"
    (path + rounded score), and a 300-char "context_preview" per chunk.
    """
    retrieved = search(question, k=k)
    passages = [hit["text"] for hit in retrieved]
    response = generate(question, passages)

    sources = [
        {"source": hit["source"], "score": round(hit["score"], 4)}
        for hit in retrieved
    ]
    previews = [passage[:300] for passage in passages]
    return {"answer": response, "sources": sources, "context_preview": previews}

if __name__ == "__main__":
    while True:
        try:
            q = input("\nQ: ").strip()
            if not q:
                continue
            if q.lower() in {"exit", "quit"}:
                break
            result = answer(q, k=5)
            print("\n--- Answer ---\n")
            print(result["answer"])
            print("\n--- Sources ---\n")
            for s in result["sources"]:
                print(f"- {s['source']} (score={s['score']})")
        except KeyboardInterrupt:
            break

πŸ’‘ Key Takeaways

  1. Modular pipeline: Separation of ingestion, embedding, retrieval, and generation makes the system testable and swappable at each layer.
  2. Quantization works: GGUF Q4 models deliver good quality with minimal compute requirements. Start there before trying larger models.
  3. Deterministic answers: Grounding LLM output in retrieved documents with explicit source tracking prevents hallucination.
  4. Tuning knobs: Chunk size, overlap, embedding model, and system prompt all affect response quality. Iterate based on your content type.
  5. GPU acceleration optional: CPU-only FAISS search and llama.cpp run well on modern hardware. GPU acceleration speeds up embedding computation but is not required.

πŸ“š Resources