GigaProjects

← Back to rag-assistant

build_index.py

"""Build the local ChromaDB retrieval index."""

from __future__ import annotations

import sys
from pathlib import Path

# Put the parent of this file's directory (presumably the project root that
# holds the `src` package -- confirm against the repo layout) at the front of
# sys.path. This must run before the `src.*` imports below so they can be
# resolved when the script is executed directly.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))

from src.chunking import chunk_documents
from src.config import CORPUS_DIR, INDEX_DIR, get_embedding_model_name
from src.documents import load_documents
from src.embeddings import embed_texts
from src.index_store import add_chunks, reset_collection


def main() -> None:
    """Rebuild the retrieval index from the corpus, reporting progress on stdout.

    Steps, in order: load the documents found under ``CORPUS_DIR``, split
    them into chunks, embed each chunk's ``"searchable_text"`` value, reset
    the collection at ``INDEX_DIR`` and add the chunks together with their
    embeddings.

    If chunking produces nothing, a message is printed and the function
    returns before ``reset_collection`` is called.
    """
    print(f"Loading documents from {CORPUS_DIR}")
    docs = load_documents(CORPUS_DIR)
    print(f"Loaded {len(docs)} documents")

    pieces = chunk_documents(docs)
    print(f"Created {len(pieces)} chunks")

    # Nothing to embed or store: stop before touching the collection.
    if not pieces:
        print("No chunks to index")
        return

    model_name = get_embedding_model_name()
    print(f"Embedding chunks with {model_name}")
    texts = [piece["searchable_text"] for piece in pieces]
    vectors = embed_texts(texts)

    # Embeddings are computed first; only then is the collection reset and
    # repopulated.
    print(f"Writing ChromaDB index to {INDEX_DIR}")
    store = reset_collection(INDEX_DIR)
    add_chunks(store, pieces, vectors)

    print(f"Indexed {len(pieces)} chunks")


# Run the index build only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

Run this code