"""ChromaDB persistence and search helpers."""
from __future__ import annotations
from pathlib import Path
# Name of the single Chroma collection that the helpers in this module
# create, delete, and look up.
COLLECTION_NAME = "reinsurance_knowledge_base"
def create_client(index_dir: Path):
    """Open a persistent Chroma client rooted at ``index_dir``.

    ``chromadb`` is imported inside the function, so the dependency is only
    loaded when a client is actually requested. The directory, including any
    missing parents, is created before the client is constructed.

    Args:
        index_dir: Directory passed (as a string) to
            ``chromadb.PersistentClient`` as its storage path.

    Returns:
        The object returned by ``chromadb.PersistentClient``.
    """
    import chromadb

    index_dir.mkdir(parents=True, exist_ok=True)
    storage_path = str(index_dir)
    persistent_client = chromadb.PersistentClient(path=storage_path)
    return persistent_client
def reset_collection(index_dir: Path):
    """Drop the knowledge-base collection if it exists, then create it empty.

    Args:
        index_dir: Directory of the persistent index; forwarded to
            ``create_client``.

    Returns:
        The collection returned by ``client.create_collection`` for
        ``COLLECTION_NAME``.
    """
    client = create_client(index_dir)
    # Depending on the installed Chroma release, ``list_collections`` yields
    # either Collection objects or bare name strings (the 0.6.x line), and in
    # the latter case reading ``.name`` on an entry raises. Accept both shapes
    # so the existence check works regardless of version.
    existing_names = {
        entry if isinstance(entry, str) else entry.name
        for entry in client.list_collections()
    }
    if COLLECTION_NAME in existing_names:
        client.delete_collection(COLLECTION_NAME)
    return client.create_collection(name=COLLECTION_NAME)
def get_collection(index_dir: Path):
    """Look up the existing knowledge-base collection in ``index_dir``.

    Args:
        index_dir: Directory of the persistent index; forwarded to
            ``create_client``.

    Returns:
        Whatever ``client.get_collection`` returns for ``COLLECTION_NAME``.
        Nothing is created here; a missing collection is reported by the
        client call itself.
    """
    return create_client(index_dir).get_collection(name=COLLECTION_NAME)
def add_chunks(collection, chunks: list[dict], embeddings: list[list[float]]) -> None:
    """Write chunks and their embeddings to ``collection`` in one ``add`` call.

    Does nothing when ``chunks`` is empty (``collection`` is not touched).

    Args:
        collection: Object exposing an ``add(ids=, embeddings=, documents=,
            metadatas=)`` method.
        chunks: Chunk dicts. Each must provide ``"id"`` and
            ``"searchable_text"`` plus the keys read by ``metadata_for``.
        embeddings: Passed through unchanged as the ``embeddings`` argument;
            presumably one vector per chunk, in the same order (not checked
            here).
    """
    if chunks:
        # Built in the same order as the keyword arguments below so that a
        # missing key surfaces for "id" first, then "searchable_text", then
        # the metadata fields.
        chunk_ids = [item["id"] for item in chunks]
        texts = [item["searchable_text"] for item in chunks]
        metadata_rows = [metadata_for(item) for item in chunks]
        collection.add(
            ids=chunk_ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadata_rows,
        )
def search(collection, query_embedding: list[float], top_k: int) -> list[dict]:
    """Query ``collection`` with one embedding and flatten the hits into rows.

    Args:
        collection: Object exposing ``query(query_embeddings=, n_results=)``
            that returns a mapping with per-query lists under ``"ids"``,
            ``"documents"``, ``"metadatas"`` and ``"distances"``.
        query_embedding: The single query vector; wrapped in a list for the
            ``query`` call.
        top_k: Forwarded as ``n_results``.

    Returns:
        One dict per returned id, in result order, with keys ``"id"``,
        ``"text"``, ``"metadata"`` and ``"distance"``. A key missing from the
        result mapping is treated as an empty hit list.
    """
    response = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    def first_query(field: str) -> list:
        # Only one embedding is submitted, so only slot 0 of each field matters.
        return response.get(field, [[]])[0]

    hit_ids = first_query("ids")
    texts = first_query("documents")
    metas = first_query("metadatas")
    dists = first_query("distances")

    # Positional indexing (rather than zip) keeps an IndexError when a field
    # has fewer entries than there are ids.
    return [
        {
            "id": hit_id,
            "text": texts[pos],
            "metadata": metas[pos],
            "distance": dists[pos],
        }
        for pos, hit_id in enumerate(hit_ids)
    ]
def metadata_for(chunk: dict) -> dict:
    """Extract the metadata fields stored alongside a chunk.

    Args:
        chunk: Chunk dict that must contain ``"source"``,
            ``"relative_path"``, ``"document_type"`` and ``"section"``.

    Returns:
        A new dict holding exactly those four keys, in that order, with the
        values copied from ``chunk``.

    Raises:
        KeyError: If any of the four keys is missing from ``chunk``.
    """
    metadata_keys = ("source", "relative_path", "document_type", "section")
    return {key: chunk[key] for key in metadata_keys}