"""Load corpus documents from disk."""
from __future__ import annotations
from pathlib import Path
# Lower-cased file suffixes (leading dot included) that load_documents() reads.
SUPPORTED_EXTENSIONS = {".md", ".csv", ".json", ".txt"}


def load_documents(corpus_dir: Path) -> list[dict]:
    """Read every supported text file below *corpus_dir* into a list of records.

    The directory is walked recursively. Only regular files whose suffix,
    compared case-insensitively, is in ``SUPPORTED_EXTENSIONS`` are loaded.
    Records are returned in sorted path order.

    Each record is a dict with these keys:

    * ``source`` -- file name without any directory part.
    * ``path`` -- the path as yielded by the walk, as a string.
    * ``relative_path`` -- the path relative to *corpus_dir*, as a string.
    * ``extension`` -- the lower-cased suffix, e.g. ``".csv"``.
    * ``document_type`` -- name of the file's immediate parent directory.
    * ``text`` -- the decoded file contents.

    Raises:
        UnicodeDecodeError: if a supported file is not valid UTF-8.
        OSError: if a supported file cannot be read.
    """
    documents = []
    for path in sorted(corpus_dir.rglob("*")):
        extension = path.suffix.lower()
        # Test the suffix first: it needs no filesystem access, unlike is_file().
        if extension not in SUPPORTED_EXTENSIONS or not path.is_file():
            continue
        # "utf-8-sig" decodes plain UTF-8 and additionally drops a leading
        # byte-order mark, so a BOM never leaks into the document text.
        text = path.read_text(encoding="utf-8-sig")
        documents.append(
            {
                "source": path.name,
                "path": str(path),
                "relative_path": str(path.relative_to(corpus_dir)),
                "extension": extension,
                "document_type": path.parent.name,
                "text": text,
            }
        )
    return documents