Building a fully offline, local LLM that talks to my documents
I wanted my local LLM to actually understand what I’ve written—docs, notes, blog drafts—without any cloud dependency. Here’s the exact setup I used, end to end, with Python tooling, a local model, and a retrieval pipeline that runs entirely offline.
Overview of the stack
- Model runtime: llama-cpp-python (fast, quantized GGUF models)
- Embedding model: sentence-transformers (local, CPU/GPU)
- Vector store: faiss (in-memory or disk-backed index)
- RAG orchestration: Lightweight custom pipeline, or langchain for convenience
- Document processing: markdown + pdfminer.six + python-docx (optional)
- Caching: Disk-based embeddings to avoid recompute
If you prefer transformers instead of llama-cpp-python, you can swap the model runner while keeping the retrieval pieces identical.
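If you go the transformers route, only the generation call changes; retrieval stays the same. Here is a minimal sketch of what that swap could look like. The model directory, dtype, and sampling settings are illustrative assumptions, not part of the setup above, and device_map="auto" additionally needs the accelerate package.
# llm_transformers.py -- illustrative alternative runner (paths and settings are assumptions)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_DIR = "./models/Mistral-7B-Instruct-v0.2"  # hypothetical local HF model directory

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    torch_dtype=torch.float16,  # use torch.float32 on CPU-only machines
    device_map="auto",          # requires the accelerate package
)

def generate(prompt: str, max_new_tokens: int = 512) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_new_tokens, temperature=0.2, do_sample=True)
    # decode only the newly generated tokens, not the prompt
    return tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)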
Setup and environment
Create and prepare the environment
# Create environment
python -m venv .venv
source .venv/bin/activate
# Core packages
pip install llama-cpp-python==0.2.78
pip install sentence-transformers==3.0.1
pip install faiss-cpu==1.8.0
pip install langchain==0.3.0 langchain-community==0.3.0
pip install markdown pdfminer.six python-docx pypdf
- Model file: Download a GGUF model (e.g., Mistral-7B-Instruct.Q4_K_M.gguf or Llama-3.1-8B-Instruct.Q4_K_M.gguf) and place it in ./models/ (a download sketch follows these notes).
- Hardware notes: Q4_K_M is a good balance of quality and speed on CPU. If you have a decent GPU, llama.cpp can leverage it via a CUDA build; otherwise CPU is fine.
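If you want to script the download while you still have connectivity, huggingface_hub can fetch a single GGUF file. The repo and filename below are examples only; substitute whichever model and quantization you picked, and make sure MODEL_PATH in llm.py (shown later) matches the file actually on disk.
# download_model.py -- one-time, online step; repo_id and filename are examples only
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",   # example repo
    filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",    # example quantization
    local_dir="./models",
)
print(f"Saved to {path}")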
Document ingestion and chunking
We need consistent chunking, metadata, and clean text extraction across Markdown/PDF/Docx. I keep it simple and deterministic.
# ingest.py
import os
import re
from pathlib import Path
from typing import List, Dict
from pdfminer.high_level import extract_text as pdf_extract
from docx import Document as DocxDocument
def read_markdown(path: Path) -> str:
text = path.read_text(encoding="utf-8")
# strip HTML comments or custom markers if needed
text = re.sub(r'<!--.*?-->', '', text, flags=re.S)
return text
def read_pdf(path: Path) -> str:
return pdf_extract(str(path))
def read_docx(path: Path) -> str:
doc = DocxDocument(str(path))
return "\n".join(p.text for p in doc.paragraphs)
def load_corpus(root: Path) -> List[Dict]:
docs = []
for p in root.rglob("*"):
if p.suffix.lower() in [".md", ".mdx"]:
content = read_markdown(p)
docs.append({"text": content, "source": str(p)})
elif p.suffix.lower() == ".pdf":
content = read_pdf(p)
docs.append({"text": content, "source": str(p)})
elif p.suffix.lower() in [".docx"]:
content = read_docx(p)
docs.append({"text": content, "source": str(p)})
return docs
def chunk_text(text: str, chunk_size=800, chunk_overlap=120) -> List[str]:
# simple word-based chunking
words = text.split()
chunks = []
i = 0
while i < len(words):
chunk = words[i:i+chunk_size]
chunks.append(" ".join(chunk))
i += (chunk_size - chunk_overlap)
return chunks
def build_documents(root: str) -> List[Dict]:
raw_docs = load_corpus(Path(root))
docs = []
for d in raw_docs:
for c in chunk_text(d["text"]):
docs.append({"text": c, "source": d["source"]})
return docs
if __name__ == "__main__":
docs = build_documents("./my_docs")
print(f"Loaded {len(docs)} chunks")
- Chunking: 800 words with a 120-word overlap works well for 7–8B models; tune for your content density.
- Metadata: the source path enables grounded answers and citations in responses.
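If you plan on chunk-level citations later, it costs nothing to record a per-source chunk index during ingestion. A small variation on build_documents you could drop into ingest.py; the chunk_id key is my naming, not something the rest of the pipeline requires.
# variation on build_documents: each chunk remembers its position within its source file
def build_documents_with_ids(root: str) -> List[Dict]:
    raw_docs = load_corpus(Path(root))
    docs = []
    for d in raw_docs:
        for i, c in enumerate(chunk_text(d["text"])):
            docs.append({"text": c, "source": d["source"], "chunk_id": i})
    return docs
Because index.py stores the whole dict as metadata, the chunk_id then shows up in search results automatically.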
Embeddings and FAISS index
Compute embeddings locally and store a FAISS index plus a sidecar JSON for metadata. This keeps the setup simple and reproducible.
# index.py
import json
import os
from pathlib import Path
from typing import List, Dict
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
INDEX_DIR = Path("./vector_index")
INDEX_DIR.mkdir(exist_ok=True)
def save_metadata(metadata: List[Dict], path: Path):
path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")
def load_metadata(path: Path) -> List[Dict]:
return json.loads(path.read_text(encoding="utf-8"))
def build_index(docs: List[Dict]):
model = SentenceTransformer(EMBEDDING_MODEL)
texts = [d["text"] for d in docs]
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
embeddings = np.array(embeddings, dtype="float32")
index = faiss.IndexFlatIP(embeddings.shape[1]) # cosine via normalized vectors
index.add(embeddings)
faiss.write_index(index, str(INDEX_DIR / "faiss.index"))
save_metadata(docs, INDEX_DIR / "meta.json")
print(f"Indexed {len(docs)} chunks into {str(INDEX_DIR)}")
def load_index():
index = faiss.read_index(str(INDEX_DIR / "faiss.index"))
meta = load_metadata(INDEX_DIR / "meta.json")
return index, meta
def search(query: str, k: int = 5):
index, meta = load_index()
model = SentenceTransformer(EMBEDDING_MODEL)
q = model.encode([query], normalize_embeddings=True)
D, I = index.search(np.array(q, dtype="float32"), k)
results = [{"score": float(D[0][j]), **meta[I[0][j]]} for j in range(len(I[0]))]
return results
if __name__ == "__main__":
from ingest import build_documents
docs = build_documents("./my_docs")
build_index(docs)
print(search("deployment strategies for Astro + Supabase"))
- Cosine similarity: Achieved via Inner Product on normalized embeddings (fast and simple).
- Caching: Re-run indexing only when documents change.
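One simple way to implement that: fingerprint the corpus and rebuild only when the fingerprint changes. A sketch, assuming file size and modification time are good enough change signals; the corpus.hash filename is my choice.
# cache_check.py -- skip re-indexing when the corpus hasn't changed (illustrative helper)
import hashlib
from pathlib import Path

INDEX_DIR = Path("./vector_index")  # same directory index.py writes to
DOC_SUFFIXES = {".md", ".mdx", ".pdf", ".docx"}

def corpus_fingerprint(root: str) -> str:
    # hash paths, sizes, and mtimes -- cheap, and enough to detect edits or new files
    h = hashlib.sha256()
    for p in sorted(Path(root).rglob("*")):
        if p.suffix.lower() in DOC_SUFFIXES:
            st = p.stat()
            h.update(f"{p}|{st.st_size}|{st.st_mtime_ns}".encode())
    return h.hexdigest()

def needs_reindex(root: str) -> bool:
    marker = INDEX_DIR / "corpus.hash"
    return not (marker.exists() and marker.read_text() == corpus_fingerprint(root))

def record_fingerprint(root: str) -> None:
    INDEX_DIR.mkdir(exist_ok=True)
    (INDEX_DIR / "corpus.hash").write_text(corpus_fingerprint(root))
Wired into index.py's __main__ block, you would call build_index only when needs_reindex("./my_docs") returns True, then record_fingerprint("./my_docs") afterwards.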
Running the local LLM with llama.cpp
Use llama-cpp-python to load a quantized GGUF model. Keep prompts short and include retrieved context.
# llm.py
from llama_cpp import Llama
MODEL_PATH = "./models/Mistral-7B-Instruct.Q4_K_M.gguf"
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,   # adjust based on model + memory
    n_batch=128,  # larger for throughput on CPU
)
SYSTEM_PROMPT = (
"You are a helpful assistant. Use only the provided context. "
"If the answer is not in the context, say you don't know."
)
def build_prompt(question: str, contexts: list[str]) -> str:
header = f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n"
ctx = "\n\n".join([f"[SOURCE] {i+1}\n{c}" for i, c in enumerate(contexts)])
user = f"\n\n[CONTEXT]\n{ctx}\n\n[QUESTION]\n{question} [/INST]"
return header + user
def generate(question: str, contexts: list[str]) -> str:
prompt = build_prompt(question, contexts)
    out = llm(
        prompt,
        max_tokens=768,
        temperature=0.2,      # sampling settings belong on the call, not the constructor
        repeat_penalty=1.1,
        stop=["</s>", "[/INST]"],
    )
return out["choices"][0]["text"].strip()
- Context window: Set n_ctx to 4096 if your model supports it; match your GGUF file’s recommended params.
- Instruction format: For Mistral/Llama Instruct, the [INST] style is robust. Adapt it if you use different models.
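If you would rather not hand-roll the template at all, llama-cpp-python can apply the chat template embedded in the GGUF metadata via create_chat_completion. A sketch of an alternative generate, added to llm.py alongside the existing one:
# alternative to build_prompt/generate: let llama.cpp apply the model's own chat template
def generate_chat(question: str, contexts: list[str]) -> str:
    ctx = "\n\n".join(f"[SOURCE {i+1}]\n{c}" for i, c in enumerate(contexts))
    out = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"[CONTEXT]\n{ctx}\n\n[QUESTION]\n{question}"},
        ],
        max_tokens=768,
        temperature=0.2,
    )
    return out["choices"][0]["message"]["content"].strip()
This sidesteps template mismatches, such as mixing Llama-2 style <<SYS>> markers with Mistral's [INST] format.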
Wiring up retrieval-augmented generation (RAG)
Tie retrieval to LLM generation, include source snippets, and return grounded answers.
# app.py
from typing import List
from index import search
from llm import generate
def answer(question: str, k: int = 5) -> dict:
hits = search(question, k=k)
contexts = [h["text"] for h in hits]
text = generate(question, contexts)
return {
"answer": text,
"sources": [{"source": h["source"], "score": round(h["score"], 4)} for h in hits],
"context_preview": [c[:300] for c in contexts],
}
if __name__ == "__main__":
while True:
try:
q = input("\nQ: ").strip()
if not q:
continue
if q.lower() in {"exit", "quit"}:
break
result = answer(q, k=5)
print("\n--- Answer ---\n")
print(result["answer"])
print("\n--- Sources ---\n")
for s in result["sources"]:
print(f"- {s['source']} (score={s['score']})")
except KeyboardInterrupt:
break
- Grounding: Show source paths and scores so you know exactly where content came from.
- Fallback: If retrieval scores are low, prompt the LLM to say “I don’t know” rather than hallucinating.
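A blunt but effective way to enforce that fallback is a threshold on the top retrieval score before the model is ever called. The cutoff below is a guess; tune it against queries you know are (and are not) covered by your corpus.
# variation on answer() in app.py; MIN_SCORE is illustrative and corpus-dependent
MIN_SCORE = 0.35  # cosine similarity of the best hit (normalized MiniLM embeddings)

def answer_guarded(question: str, k: int = 5) -> dict:
    hits = search(question, k=k)
    if not hits or hits[0]["score"] < MIN_SCORE:
        return {"answer": "I don't know -- nothing relevant in the indexed documents.",
                "sources": [], "context_preview": []}
    contexts = [h["text"] for h in hits]
    return {
        "answer": generate(question, contexts),
        "sources": [{"source": h["source"], "score": round(h["score"], 4)} for h in hits],
        "context_preview": [c[:300] for c in contexts],
    }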
Optional enhancements
Better chunking and metadata
- Semantic chunking: Use headings and semantic boundaries for Markdown (e.g., split on # and ## before word-based chunking); a rough sketch follows this list.
- Citation stitching: Return line numbers or section titles for clearer references in answers.
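Here is the rough heading-aware split mentioned above, meant to run before chunk_text in ingest.py. The regex only looks at # and ## headings and is a simplification, not a real Markdown parser.
# heading-aware pre-split for Markdown; relies on re and chunk_text already present in ingest.py
def split_markdown_sections(text: str) -> List[str]:
    # split at lines starting with "# " or "## ", keeping each heading with its section
    parts = re.split(r'(?m)^(?=#{1,2}\s)', text)
    return [p.strip() for p in parts if p.strip()]

def chunk_markdown(text: str, chunk_size=800, chunk_overlap=120) -> List[str]:
    chunks = []
    for section in split_markdown_sections(text):
        chunks.extend(chunk_text(section, chunk_size, chunk_overlap))
    return chunks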
Speed and quality
- GPU acceleration: Build llama-cpp-python with CUDA or Metal support for significant speedups.
- Larger embeddings: Try all-mpnet-base-v2 if you want higher retrieval quality, at the cost of speed. Switching embedding models changes the vector dimension, so rebuild the FAISS index afterwards.
LangChain integration (if you want convenience)
# rag_langchain.py
from pathlib import Path
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from ingest import load_corpus
embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
# Load un-chunked documents with the ingestion helpers from earlier;
# this time the text splitter handles chunking
docs = load_corpus(Path("./my_docs"))
texts = [d["text"] for d in docs]
splits = text_splitter.create_documents(texts, metadatas=[{"source": d["source"]} for d in docs])
vectorstore = FAISS.from_documents(splits, embedding)
llm = LlamaCpp(
model_path="./models/Mistral-7B-Instruct.Q4_K_M.gguf",
n_ctx=4096,
temperature=0.2,
)
template = """You are a helpful assistant. Use ONLY the context below.
{context}
Question: {question}
If not in context, say you don't know."""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
)

# Example query (reusing the one from index.py's smoke test)
print(qa.invoke({"query": "deployment strategies for Astro + Supabase"})["result"])
- Trade-off: LangChain speeds up scaffolding but introduces abstraction; the custom approach is leaner and transparent.
What made this work well for me
- Privacy-first: Everything runs offline; no requests leave my machine.
- Modular pipeline: Ingestion → embeddings → FAISS → llama.cpp → prompt. Each layer is swappable.
- Grounded answers: Responses stay within my own documents, with explicit sources and a conservative prompt.
- Performance knobs: Quantized GGUF models + cosine similarity on normalized embeddings = fast retrieval, responsive answers.
Quick start checklist
- Collect: Put all docs in ./my_docs (Markdown, PDFs, Docx).
- Index: Run python index.py (it imports the ingest helpers) to build chunks and the FAISS index.
- Model: Download a GGUF instruct model to ./models.
- Chat: Run python app.py and start asking questions about your content.
- Iterate: Tune chunk size, k, and the system prompt based on responses.
If you adapt this for your own docs and hardware, the pipeline stays the same; the choices that matter most are the model, the quantization, and the chunking strategy, and those depend heavily on what you index and what you run it on.