GigaProjects

← Back to rag-assistant

documents.py

"""Load corpus documents from disk."""

from __future__ import annotations

from pathlib import Path


# Lower-cased file suffixes that load_documents() treats as corpus documents.
SUPPORTED_EXTENSIONS = {".md", ".csv", ".json", ".txt"}


def load_documents(corpus_dir: Path | str) -> list[dict]:
    """Load every supported text file found under *corpus_dir*.

    The directory is walked recursively and files are visited in sorted
    path order, so the result is deterministic for a given tree. Only
    regular files whose suffix (compared case-insensitively) is in
    ``SUPPORTED_EXTENSIONS`` are loaded.

    Args:
        corpus_dir: Root directory of the corpus, as a ``Path`` or a plain
            string path.

    Returns:
        One dict per loaded file, with the keys:

        * ``source`` - the file name, without any directory part.
        * ``path`` - the file path as produced by walking *corpus_dir*.
        * ``relative_path`` - the file path relative to *corpus_dir*.
        * ``extension`` - the lower-cased file suffix, e.g. ``".md"``.
        * ``document_type`` - the name of the directory that directly
          contains the file.
        * ``text`` - the decoded file contents.

    Raises:
        UnicodeDecodeError: If a supported file is not valid UTF-8.
    """
    # Accept plain string paths as well as Path objects.
    corpus_dir = Path(corpus_dir)
    documents = []

    for path in sorted(corpus_dir.rglob("*")):
        extension = path.suffix.lower()
        # Check the suffix first: it is a pure string test, whereas
        # is_file() has to hit the filesystem.
        if extension not in SUPPORTED_EXTENSIONS:
            continue
        if not path.is_file():
            continue

        # "utf-8-sig" decodes ordinary UTF-8 unchanged but drops a leading
        # byte-order mark, so it does not leak into the text as "\ufeff".
        text = path.read_text(encoding="utf-8-sig")
        documents.append(
            {
                "source": path.name,
                "path": str(path),
                "relative_path": str(path.relative_to(corpus_dir)),
                "extension": extension,
                "document_type": path.parent.name,
                "text": text,
            }
        )

    return documents

Run this code