Building a fully offline, local LLM that talks to my documents
I wanted my local LLM to actually understand what I’ve written—docs, notes, blog drafts—without any cloud dependency. Here’s the exact setup I used, end to end, with Python tooling, a local model, and a retrieval pipeline that runs entirely offline.
Overview of the stack
- Model runtime: llama-cpp-python (fast, quantized GGUF models)
- Embedding model: sentence-transformers (local, CPU/GPU)
- Vector store: faiss (in-memory or disk-backed index)
- RAG orchestration: Lightweight custom pipeline, or langchain for convenience
- Document processing: markdown + pdfminer.six + python-docx (optional)
- Caching: Disk-based embeddings to avoid recompute
If you prefer transformers instead of llama-cpp-python, you can swap the model runner while keeping the retrieval pieces identical.
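If you go the transformers route, only the generation call changes; retrieval stays the same. Here is a minimal sketch of what that swap could look like. The model directory, dtype, and sampling settings are illustrative assumptions, not part of the setup above, and device_map="auto" additionally needs the accelerate package.
# llm_transformers.py -- illustrative alternative runner (paths and settings are assumptions)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_DIR = "./models/Mistral-7B-Instruct-v0.2"  # hypothetical local HF model directory

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    torch_dtype=torch.float16,  # use torch.float32 on CPU-only machines
    device_map="auto",          # requires the accelerate package
)

def generate(prompt: str, max_new_tokens: int = 512) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_new_tokens, temperature=0.2, do_sample=True)
    # decode only the newly generated tokens, not the prompt
    return tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)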
Setup and environment
Create and prepare the environment
# Create environment
python -m venv .venv
source .venv/bin/activate
# Core packages
pip install llama-cpp-python==0.2.78
pip install sentence-transformers==3.0.1
pip install faiss-cpu==1.8.0
pip install langchain==0.3.0 langchain-community==0.3.0
pip install markdown pdfminer.six python-docx pypdf
- Model file: Download a GGUF model (e.g., Mistral-7B-Instruct.Q4_K_M.gguf or Llama-3.1-8B-Instruct.Q4_K_M.gguf) and place it in ./models/ (a download sketch follows these notes).
- Hardware notes: Q4_K_M is a good balance of quality and speed on CPU. If you have a decent GPU, llama.cpp can leverage it via a CUDA build; otherwise CPU is fine.
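If you want to script the download while you still have connectivity, huggingface_hub can fetch a single GGUF file. The repo and filename below are examples only; substitute whichever model and quantization you picked, and make sure MODEL_PATH in llm.py (shown later) matches the file actually on disk.
# download_model.py -- one-time, online step; repo_id and filename are examples only
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",   # example repo
    filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",    # example quantization
    local_dir="./models",
)
print(f"Saved to {path}")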
Document ingestion and chunking
We need consistent chunking, metadata, and clean text extraction across Markdown/PDF/Docx. I keep it simple and deterministic.
# ingest.py
import os
import re
from pathlib import Path
from typing import List, Dict
from pdfminer.high_level import extract_text as pdf_extract
from docx import Document as DocxDocument
def read_markdown(path: Path) -> str:
text = path.read_text(encoding="utf-8")
# strip HTML comments or custom markers if needed
text = re.sub(r'<!--.*?-->', '', text, flags=re.S)
return text
def read_pdf(path: Path) -> str:
return pdf_extract(str(path))
def read_docx(path: Path) -> str:
doc = DocxDocument(str(path))
return "\n".join(p.text for p in doc.paragraphs)
def load_corpus(root: Path) -> List[Dict]:
docs = []
for p in root.rglob("*"):
if p.suffix.lower() in [".md", ".mdx"]:
content = read_markdown(p)
docs.append({"text": content, "source": str(p)})
elif p.suffix.lower() == ".pdf":
content = read_pdf(p)
docs.append({"text": content, "source": str(p)})
elif p.suffix.lower() in [".docx"]:
content = read_docx(p)
docs.append({"text": content, "source": str(p)})
return docs
def chunk_text(text: str, chunk_size=800, chunk_overlap=120) -> List[str]:
# simple word-based chunking
words = text.split()
chunks = []
i = 0
while i < len(words):
chunk = words[i:i+chunk_size]
chunks.append(" ".join(chunk))
i += (chunk_size - chunk_overlap)
return chunks
def build_documents(root: str) -> List[Dict]:
raw_docs = load_corpus(Path(root))
docs = []
for d in raw_docs:
for c in chunk_text(d["text"]):
docs.append({"text": c, "source": d["source"]})
return docs
if __name__ == "__main__":
docs = build_documents("./my_docs")
print(f"Loaded {len(docs)} chunks")
- Chunking: 800 words with a 120-word overlap works well for 7–8B models; tune for your content density.
- Metadata: the source path enables grounded answers and citations in responses.
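If you plan on chunk-level citations later, it costs nothing to record a per-source chunk index during ingestion. A small variation on build_documents you could drop into ingest.py; the chunk_id key is my naming, not something the rest of the pipeline requires.
# variation on build_documents: each chunk remembers its position within its source file
def build_documents_with_ids(root: str) -> List[Dict]:
    raw_docs = load_corpus(Path(root))
    docs = []
    for d in raw_docs:
        for i, c in enumerate(chunk_text(d["text"])):
            docs.append({"text": c, "source": d["source"], "chunk_id": i})
    return docs
Because index.py stores the whole dict as metadata, the chunk_id then shows up in search results automatically.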
Embeddings and FAISS index
Compute embeddings locally and store a FAISS index plus a sidecar JSON for metadata. This keeps the setup simple and reproducible.
# index.py
import json
import os
from pathlib import Path
from typing import List, Dict
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
INDEX_DIR = Path("./vector_index")
INDEX_DIR.mkdir(exist_ok=True)
def save_metadata(metadata: List[Dict], path: Path):
path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")
def load_metadata(path: Path) -> List[Dict]:
return json.loads(path.read_text(encoding="utf-8"))
def build_index(docs: List[Dict]):
model = SentenceTransformer(EMBEDDING_MODEL)
texts = [d["text"] for d in docs]
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
embeddings = np.array(embeddings, dtype="float32")
index = faiss.IndexFlatIP(embeddings.shape[1]) # cosine via normalized vectors
index.add(embeddings)
faiss.write_index(index, str(INDEX_DIR / "faiss.index"))
save_metadata(docs, INDEX_DIR / "meta.json")
print(f"Indexed {len(docs)} chunks into {str(INDEX_DIR)}")
def load_index():
index = faiss.read_index(str(INDEX_DIR / "faiss.index"))
meta = load_metadata(INDEX_DIR / "meta.json")
return index, meta
def search(query: str, k: int = 5):
index, meta = load_index()
model = SentenceTransformer(EMBEDDING_MODEL)
q = model.encode([query], normalize_embeddings=True)
D, I = index.search(np.array(q, dtype="float32"), k)
results = [{"score": float(D[0][j]), **meta[I[0][j]]} for j in range(len(I[0]))]
return results
if __name__ == "__main__":
from ingest import build_documents
docs = build_documents("./my_docs")
build_index(docs)
print(search("deployment strategies for Astro + Supabase"))
- Cosine similarity: Achieved via Inner Product on normalized embeddings (fast and simple).
- Caching: Re-run indexing only when documents change.
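One simple way to implement that: fingerprint the corpus and rebuild only when the fingerprint changes. A sketch, assuming file size and modification time are good enough change signals; the corpus.hash filename is my choice.
# cache_check.py -- skip re-indexing when the corpus hasn't changed (illustrative helper)
import hashlib
from pathlib import Path

INDEX_DIR = Path("./vector_index")  # same directory index.py writes to
DOC_SUFFIXES = {".md", ".mdx", ".pdf", ".docx"}

def corpus_fingerprint(root: str) -> str:
    # hash paths, sizes, and mtimes -- cheap, and enough to detect edits or new files
    h = hashlib.sha256()
    for p in sorted(Path(root).rglob("*")):
        if p.suffix.lower() in DOC_SUFFIXES:
            st = p.stat()
            h.update(f"{p}|{st.st_size}|{st.st_mtime_ns}".encode())
    return h.hexdigest()

def needs_reindex(root: str) -> bool:
    marker = INDEX_DIR / "corpus.hash"
    return not (marker.exists() and marker.read_text() == corpus_fingerprint(root))

def record_fingerprint(root: str) -> None:
    INDEX_DIR.mkdir(exist_ok=True)
    (INDEX_DIR / "corpus.hash").write_text(corpus_fingerprint(root))
Wired into index.py's __main__ block, you would call build_index only when needs_reindex("./my_docs") returns True, then record_fingerprint("./my_docs") afterwards.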
Running the local LLM with llama.cpp
Use llama-cpp-python to load a quantized GGUF model. Keep prompts short and include retrieved context.
# llm.py
from llama_cpp import Llama
MODEL_PATH = "./models/Mistral-7B-Instruct.Q4_K_M.gguf"
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,   # adjust based on model + memory
    n_batch=128,  # larger for throughput on CPU
)
SYSTEM_PROMPT = (
"You are a helpful assistant. Use only the provided context. "
"If the answer is not in the context, say you don't know."
)
def build_prompt(question: str, contexts: list[str]) -> str:
header = f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n"
ctx = "\n\n".join([f"[SOURCE] {i+1}\n{c}" for i, c in enumerate(contexts)])
user = f"\n\n[CONTEXT]\n{ctx}\n\n[QUESTION]\n{question} [/INST]"
return header + user
def generate(question: str, contexts: list[str]) -> str:
prompt = build_prompt(question, contexts)
    out = llm(
        prompt,
        max_tokens=768,
        temperature=0.2,      # sampling settings belong on the call, not the constructor
        repeat_penalty=1.1,
        stop=["</s>", "[/INST]"],
    )
return out["choices"][0]["text"].strip()
- Context window: Set n_ctx to 4096 if your model supports it; match your GGUF file’s recommended params.
- Instruction format: For Mistral/Llama Instruct, the [INST] style is robust. Adapt it if you use different models.
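If you would rather not hand-roll the template at all, llama-cpp-python can apply the chat template embedded in the GGUF metadata via create_chat_completion. A sketch of an alternative generate, added to llm.py alongside the existing one:
# alternative to build_prompt/generate: let llama.cpp apply the model's own chat template
def generate_chat(question: str, contexts: list[str]) -> str:
    ctx = "\n\n".join(f"[SOURCE {i+1}]\n{c}" for i, c in enumerate(contexts))
    out = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"[CONTEXT]\n{ctx}\n\n[QUESTION]\n{question}"},
        ],
        max_tokens=768,
        temperature=0.2,
    )
    return out["choices"][0]["message"]["content"].strip()
This sidesteps template mismatches, such as mixing Llama-2 style <<SYS>> markers with Mistral's [INST] format.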
Wiring up retrieval-augmented generation (RAG)
Tie retrieval to LLM generation, include source snippets, and return grounded answers.
# app.py
from typing import List
from index import search
from llm import generate
def answer(question: str, k: int = 5) -> dict:
hits = search(question, k=k)
contexts = [h["text"] for h in hits]
text = generate(question, contexts)
return {
"answer": text,
"sources": [{"source": h["source"], "score": round(h["score"], 4)} for h in hits],
"context_preview": [c[:300] for c in contexts],
}
if __name__ == "__main__":
while True:
try:
q = input("\nQ: ").strip()
if not q:
continue
if q.lower() in {"exit", "quit"}:
break
result = answer(q, k=5)
print("\n--- Answer ---\n")
print(result["answer"])
print("\n--- Sources ---\n")
for s in result["sources"]:
print(f"- {s['source']} (score={s['score']})")
except KeyboardInterrupt:
break
- Grounding: Show source paths and scores so you know exactly where content came from.
- Fallback: If retrieval scores are low, prompt the LLM to say “I don’t know” rather than hallucinating.
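A blunt but effective way to enforce that fallback is a threshold on the top retrieval score before the model is ever called. The cutoff below is a guess; tune it against queries you know are (and are not) covered by your corpus.
# variation on answer() in app.py; MIN_SCORE is illustrative and corpus-dependent
MIN_SCORE = 0.35  # cosine similarity of the best hit (normalized MiniLM embeddings)

def answer_guarded(question: str, k: int = 5) -> dict:
    hits = search(question, k=k)
    if not hits or hits[0]["score"] < MIN_SCORE:
        return {"answer": "I don't know -- nothing relevant in the indexed documents.",
                "sources": [], "context_preview": []}
    contexts = [h["text"] for h in hits]
    return {
        "answer": generate(question, contexts),
        "sources": [{"source": h["source"], "score": round(h["score"], 4)} for h in hits],
        "context_preview": [c[:300] for c in contexts],
    }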
Optional enhancements
Better chunking and metadata
- Semantic chunking: Use headings and semantic boundaries for Markdown (e.g., split on # and ## before word-based chunking); a rough sketch follows this list.
- Citation stitching: Return line numbers or section titles for clearer references in answers.
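Here is the rough heading-aware split mentioned above, meant to run before chunk_text in ingest.py. The regex only looks at # and ## headings and is a simplification, not a real Markdown parser.
# heading-aware pre-split for Markdown; relies on re and chunk_text already present in ingest.py
def split_markdown_sections(text: str) -> List[str]:
    # split at lines starting with "# " or "## ", keeping each heading with its section
    parts = re.split(r'(?m)^(?=#{1,2}\s)', text)
    return [p.strip() for p in parts if p.strip()]

def chunk_markdown(text: str, chunk_size=800, chunk_overlap=120) -> List[str]:
    chunks = []
    for section in split_markdown_sections(text):
        chunks.extend(chunk_text(section, chunk_size, chunk_overlap))
    return chunks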
Speed and quality
- GPU acceleration: Build llama-cpp-python with CUDA or Metal support for significant speedups.
- Larger embeddings: Try all-mpnet-base-v2 if you want higher retrieval quality, at the cost of speed. Switching embedding models changes the vector dimension, so rebuild the FAISS index afterwards.
LangChain integration (if you want convenience)
# rag_langchain.py
from pathlib import Path
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from ingest import load_corpus
embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
# Load un-chunked documents with the ingestion helpers from earlier;
# this time the text splitter handles chunking
docs = load_corpus(Path("./my_docs"))
texts = [d["text"] for d in docs]
splits = text_splitter.create_documents(texts, metadatas=[{"source": d["source"]} for d in docs])
vectorstore = FAISS.from_documents(splits, embedding)
llm = LlamaCpp(
model_path="./models/Mistral-7B-Instruct.Q4_K_M.gguf",
n_ctx=4096,
temperature=0.2,
)
template = """You are a helpful assistant. Use ONLY the context below.
{context}
Question: {question}
If not in context, say you don't know."""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
)

# Example query (reusing the one from index.py's smoke test)
print(qa.invoke({"query": "deployment strategies for Astro + Supabase"})["result"])
- Trade-off: LangChain speeds up scaffolding but introduces abstraction; the custom approach is leaner and transparent.
What made this work well for me
- Privacy-first: Everything runs offline; no requests leave my machine.
- Modular pipeline: Ingestion → embeddings → FAISS → llama.cpp → prompt. Each layer is swappable.
- Grounded answers: Responses stay within my own documents, with explicit sources and a conservative prompt.
- Performance knobs: Quantized GGUF models + cosine similarity on normalized embeddings = fast retrieval, responsive answers.
Quick start checklist
- Collect: Put all docs in ./my_docs (Markdown, PDFs, Docx).
- Index: Run python index.py (it imports the ingest helpers) to build chunks and the FAISS index.
- Model: Download a GGUF instruct model to ./models.
- Chat: Run python app.py and start asking questions about your content.
- Iterate: Tune chunk size, k, and the system prompt based on responses.
If you adapt this for your own docs and hardware, the pipeline stays the same; the choices that matter most are the model, the quantization, and the chunking strategy, and those depend heavily on what you index and what you run it on.