"""Generate the synthetic reinsurance knowledge-base corpus.
This script is Phase 1 of the project.
It defines 30 document specs and turns each spec into one detailed OpenRouter prompt.
The generated documents are written to data/corpus/.
Run:
python scripts/generate_corpus.py --dry-run
python scripts/generate_corpus.py
Required for generation:
OPENROUTER_API_KEY
Optional:
GENERATION_MODEL
OPENROUTER_BASE_URL
"""
from __future__ import annotations
import argparse
import os
from pathlib import Path
# Directory one level above the folder that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Root folder for generated documents; each document type gets a subfolder here.
CORPUS_ROOT = PROJECT_ROOT / "data" / "corpus"
# API endpoint used when OPENROUTER_BASE_URL is not set.
DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"
# Model used when neither --model nor GENERATION_MODEL is provided.
DEFAULT_MODEL = "openai/gpt-5-mini"
# Maps a spec's "type" value to the subfolder of CORPUS_ROOT that receives the
# generated file. Several document types intentionally share one folder.
DOCUMENT_TYPE_DIRS = {
    "meeting_minutes": "meeting_minutes",
    "proposal": "proposals",
    "project_report": "project_reports",
    "technical_report": "actuarial_reports",
    "technical_spec": "actuarial_reports",
    "claims_review": "project_reports",
    "bordereaux_note": "actuarial_reports",
    "team_note": "team_notes",
    "structured_data": "structured_data",
    "market_memo": "project_reports",
}
# One dict per document to generate. Every spec carries the same keys:
#   id                 - document identifier, matched by --only / --show-prompt
#   filename           - output file name; its extension (.csv / .json / other)
#                        selects the output-format instructions in the prompt
#   type               - key into DOCUMENT_TYPE_DIRS, i.e. the output subfolder
#   client, project, date, title, length, has_table
#                      - copied into the "Document identity" part of the prompt
#   supports_questions - evaluation question IDs listed in the prompt and in
#                        the dry-run plan (may be empty)
#   must_include       - facts rendered under "Facts that must appear"
#   must_not_include   - facts rendered under "Facts that must not appear"
DOCUMENT_SPECS = [
    # --- Triglav Adriatic Insurance: Property Catastrophe XoL Renewal ---
    {
        "id": "D001",
        "filename": "2024-03-12_triglav_adriatic_property_cat_kickoff.md",
        "type": "meeting_minutes",
        "client": "Triglav Adriatic Insurance",
        "project": "Property Catastrophe XoL Renewal",
        "date": "2024-03-12",
        "title": "Property Catastrophe Renewal Kickoff Notes",
        "length": "short",
        "has_table": False,
        "supports_questions": ["q001", "q002"],
        "must_include": [
            "Early working preference for a EUR 5m attachment point.",
            "Clear note that exposure data was incomplete and not yet validated.",
            "The EUR 5m figure is provisional, not final.",
            "Attendees include Eva Kralj and Sara Kovac from the advisory team.",
        ],
        "must_not_include": [
            "Do not state that EUR 5m was the final selected attachment.",
            "Do not mention the final EUR 7.5m attachment point.",
        ],
    },
    {
        "id": "D002",
        "filename": "2024-04-05_triglav_adriatic_exposure_extract_v1.csv",
        "type": "structured_data",
        "client": "Triglav Adriatic Insurance",
        "project": "Property Catastrophe XoL Renewal",
        "date": "2024-04-05",
        "title": "Initial Property Exposure Extract",
        "length": "short",
        "has_table": True,
        "supports_questions": ["q005"],
        "must_include": [
            "CSV rows by CRESTA zone.",
            "Total insured value sums to EUR 1.80bn.",
            "A status or note column marking the extract as preliminary.",
        ],
        "must_not_include": [
            "Do not include the final EUR 2.05bn total insured value.",
        ],
    },
    {
        "id": "D003",
        "filename": "2024-05-20_triglav_adriatic_cat_model_report.md",
        "type": "technical_report",
        "client": "Triglav Adriatic Insurance",
        "project": "Property Catastrophe XoL Renewal",
        "date": "2024-05-20",
        "title": "Final Catastrophe Modelling Report",
        "length": "long",
        "has_table": True,
        "supports_questions": ["q003", "q005"],
        "must_include": [
            "Final validated total insured value is EUR 2.05bn.",
            "Explain that the 2024-04-05 preliminary extract was superseded after data cleansing.",
            "Refer to the superseded extract by filename: 2024-04-05_triglav_adriatic_exposure_extract_v1.csv.",
            "Sara Kovac prepared the catastrophe modelling analysis.",
            "Include at least two Markdown tables, including one CRESTA zone table.",
            "Use multiple sections and appendices so the document feels longer than 5 pages.",
        ],
        "must_not_include": [
            "Do not say EUR 1.80bn is the latest total insured value.",
        ],
    },
    {
        "id": "D004",
        "filename": "2024-06-10_triglav_adriatic_property_cat_final_recommendation.md",
        "type": "proposal",
        "client": "Triglav Adriatic Insurance",
        "project": "Property Catastrophe XoL Renewal",
        "date": "2024-06-10",
        "title": "Final Property Catastrophe Renewal Recommendation",
        "length": "medium",
        "has_table": True,
        "supports_questions": ["q001", "q002", "q003", "q004"],
        "must_include": [
            "Final selected attachment point is EUR 7.5m.",
            "Mention that an earlier EUR 5m working option was rejected.",
            "Helvetia Re is selected as lead reinsurer with a 22.5% signed line.",
            "Assigned internal team includes Eva Kralj and Sara Kovac.",
            "Include a treaty structure table.",
        ],
        "must_not_include": [
            "Do not call EUR 5m the final attachment point.",
        ],
    },
    {
        "id": "D005",
        "filename": "2024-06-04_triglav_adriatic_broker_market_feedback.md",
        "type": "market_memo",
        "client": "Triglav Adriatic Insurance",
        "project": "Property Catastrophe XoL Renewal",
        "date": "2024-06-04",
        "title": "Broker Market Feedback Memo",
        "length": "short",
        "has_table": False,
        "supports_questions": ["q004"],
        "must_include": [
            "Helvetia Re offered the most stable lead terms.",
            "Two alternative markets declined to lead below EUR 7.5m attachment.",
            "Recommendation to use Helvetia Re as lead market.",
        ],
        "must_not_include": [
            "Do not introduce a different final lead reinsurer.",
        ],
    },
    # --- Sava Danube Re: Retrocession and Solvency II Capital Review ---
    {
        "id": "D006",
        "filename": "2024-02-28_sava_danube_solvency_capital_model_note.md",
        "type": "technical_report",
        "client": "Sava Danube Re",
        "project": "Retrocession and Solvency II Capital Review",
        "date": "2024-02-28",
        "title": "Solvency II Capital Model Technical Note",
        "length": "long",
        "has_table": True,
        "supports_questions": ["q012"],
        "must_include": [
            "Solvency II capital modelling was performed for Sava Danube Re.",
            "Main lesson: align retrocession assumptions with the internal model before market submission.",
            "Include SCR impact tables for at least three retrocession options.",
            "Use multiple sections and appendices so the document feels longer than 5 pages.",
        ],
        "must_not_include": [
            "Do not mention Triglav's EUR 7.5m attachment point.",
        ],
    },
    {
        "id": "D007",
        "filename": "2024-03-18_sava_danube_retrocession_placement_summary.md",
        "type": "proposal",
        "client": "Sava Danube Re",
        "project": "Retrocession and Solvency II Capital Review",
        "date": "2024-03-18",
        "title": "Retrocession Placement Summary",
        "length": "medium",
        "has_table": True,
        "supports_questions": ["q012", "q013"],
        "must_include": [
            "Selected event limit is EUR 40m.",
            "The EUR 40m limit was chosen after capital model sensitivity testing.",
            "Include a placement terms table with limit, attachment, reinstatement, and premium.",
        ],
        "must_not_include": [
            "Do not say the selected event limit was EUR 25m.",
        ],
    },
    # --- Merkur Mutual: Motor Quota Share Renewal ---
    {
        "id": "D008",
        "filename": "2024-01-22_merkur_mutual_motor_qs_kickoff.md",
        "type": "meeting_minutes",
        "client": "Merkur Mutual",
        "project": "Motor Quota Share Renewal",
        "date": "2024-01-22",
        "title": "Motor Quota Share Kickoff Notes",
        "length": "short",
        "has_table": False,
        "supports_questions": ["q007"],
        "must_include": [
            "Initial target ceding commission was 29.0%.",
            "The 29.0% figure was based on incomplete 2023 claims data.",
            "The team agreed to revisit the commission after the updated loss triangle.",
        ],
        "must_not_include": [
            "Do not say 29.0% was the final recommendation.",
        ],
    },
    {
        "id": "D009",
        "filename": "2024-02-06_merkur_mutual_motor_loss_triangle.csv",
        "type": "structured_data",
        "client": "Merkur Mutual",
        "project": "Motor Quota Share Renewal",
        "date": "2024-02-06",
        "title": "Motor Loss Triangle Extract",
        "length": "short",
        "has_table": True,
        "supports_questions": ["q006", "q007"],
        "must_include": [
            "CSV accident-year loss triangle.",
            "Updated 2023 ultimate loss ratio is materially better than the kickoff assumption.",
            "Include paid loss, case reserve, and ultimate loss columns.",
        ],
        "must_not_include": [
            "Do not include prose outside the CSV.",
        ],
    },
    {
        "id": "D010",
        "filename": "2024-02-20_merkur_mutual_motor_qs_final_proposal.md",
        "type": "proposal",
        "client": "Merkur Mutual",
        "project": "Motor Quota Share Renewal",
        "date": "2024-02-20",
        "title": "Final Motor Quota Share Proposal",
        "length": "medium",
        "has_table": True,
        "supports_questions": ["q006", "q007"],
        "must_include": [
            "Final recommended ceding commission is 31.5%.",
            "Explain that the kickoff 29.0% target was revised upward after updated loss data.",
            "Include a treaty economics table.",
        ],
        "must_not_include": [
            "Do not state that 29.0% is still recommended.",
        ],
    },
    {
        "id": "D011",
        "filename": "2024-02-26_merkur_mutual_underwriting_guideline_review.md",
        "type": "project_report",
        "client": "Merkur Mutual",
        "project": "Motor Quota Share Renewal",
        "date": "2024-02-26",
        "title": "Motor Underwriting Guideline Review",
        "length": "medium",
        "has_table": False,
        "supports_questions": ["q006"],
        "must_include": [
            "Underwriting guideline changes support the 31.5% final commission.",
            "Mention restrictions on high-frequency fleet segments.",
            "Mention Luka Horvat as actuarial pricing reviewer.",
        ],
        "must_not_include": [
            "Do not introduce a new final ceding commission.",
        ],
    },
    # --- Adriatic Health Insurance: Health Stop-Loss Placement ---
    {
        "id": "D012",
        "filename": "2024-04-02_adriatic_health_stop_loss_kickoff.md",
        "type": "meeting_minutes",
        "client": "Adriatic Health Insurance",
        "project": "Health Stop-Loss Placement",
        "date": "2024-04-02",
        "title": "Health Stop-Loss Kickoff Notes",
        "length": "short",
        "has_table": False,
        "supports_questions": ["q010", "q011"],
        "must_include": [
            "Initial working deductible was EUR 1.0m.",
            "Claims data was not final at kickoff.",
            "The deductible would be revisited after updated large-claim analysis.",
        ],
        "must_not_include": [
            "Do not say EUR 1.0m was the bound deductible.",
        ],
    },
    {
        "id": "D013",
        "filename": "2024-04-12_adriatic_health_large_claims_triangle.csv",
        "type": "structured_data",
        "client": "Adriatic Health Insurance",
        "project": "Health Stop-Loss Placement",
        "date": "2024-04-12",
        "title": "Large Claims Triangle Extract",
        "length": "short",
        "has_table": True,
        "supports_questions": ["q011"],
        "must_include": [
            "CSV triangle for large medical claims.",
            "Show that late development increased after the prior review.",
            "Include accident year, reported claims, paid claims, outstanding, and ultimate estimate.",
        ],
        "must_not_include": [
            "Do not include Markdown formatting.",
        ],
    },
    {
        "id": "D014",
        "filename": "2024-04-08_adriatic_health_initial_claims_review.md",
        "type": "claims_review",
        "client": "Adriatic Health Insurance",
        "project": "Health Stop-Loss Placement",
        "date": "2024-04-08",
        "title": "Initial Stop-Loss Claims Review",
        "length": "short",
        "has_table": False,
        "supports_questions": ["q011"],
        "must_include": [
            "Initial expected claims ratio was 76%.",
            "The review predates the large-claims triangle update.",
            "The 76% figure is marked preliminary.",
        ],
        "must_not_include": [
            "Do not say 76% is the final expected claims ratio.",
        ],
    },
    {
        "id": "D015",
        "filename": "2024-04-22_adriatic_health_final_pricing_note.md",
        "type": "technical_report",
        "client": "Adriatic Health Insurance",
        "project": "Health Stop-Loss Placement",
        "date": "2024-04-22",
        "title": "Final Stop-Loss Pricing Note",
        "length": "medium",
        "has_table": True,
        "supports_questions": ["q010", "q011"],
        "must_include": [
            "Latest expected claims ratio is 82%.",
            "Explain that the 76% initial ratio was superseded by updated large-claim development.",
            "Recommend a EUR 1.2m deductible.",
            "Include a pricing sensitivity table.",
        ],
        "must_not_include": [
            "Do not present 76% as current.",
        ],
    },
    {
        "id": "D016",
        "filename": "2024-05-03_adriatic_health_stop_loss_signed_summary.md",
        "type": "proposal",
        "client": "Adriatic Health Insurance",
        "project": "Health Stop-Loss Placement",
        "date": "2024-05-03",
        "title": "Signed Stop-Loss Placement Summary",
        "length": "short",
        "has_table": True,
        "supports_questions": ["q010"],
        "must_include": [
            "Bound deductible is EUR 1.2m.",
            "Mention that the EUR 1.0m kickoff option was not bound.",
            "Include a final placement table.",
        ],
        "must_not_include": [
            "Do not say the bound deductible is EUR 1.0m.",
        ],
    },
    # --- Balkan Motor Pool: Motor Pool Bordereaux Quality Review ---
    {
        "id": "D017",
        "filename": "2024-03-05_balkan_motor_bordereaux_audit.md",
        "type": "bordereaux_note",
        "client": "Balkan Motor Pool",
        "project": "Motor Pool Bordereaux Quality Review",
        "date": "2024-03-05",
        "title": "Bordereaux Data Quality Audit",
        "length": "medium",
        "has_table": True,
        "supports_questions": ["q008"],
        "must_include": [
            "Tim Zupan led the bordereaux data quality audit.",
            "Main issues: missing vehicle class, inconsistent claim status, delayed reserve updates.",
            "Include a data quality findings table.",
        ],
        "must_not_include": [
            "Do not assign the audit lead to Sara Kovac.",
        ],
    },
    {
        "id": "D018",
        "filename": "2024-03-08_balkan_motor_bordereaux_quality_metrics.json",
        "type": "structured_data",
        "client": "Balkan Motor Pool",
        "project": "Motor Pool Bordereaux Quality Review",
        "date": "2024-03-08",
        "title": "Bordereaux Quality Metrics",
        "length": "short",
        "has_table": True,
        "supports_questions": ["q008"],
        "must_include": [
            "Valid JSON only.",
            "Metrics for missing_vehicle_class, inconsistent_claim_status, and delayed_reserve_updates.",
            "A reviewer field with Tim Zupan.",
        ],
        "must_not_include": [
            "Do not wrap JSON in Markdown fences.",
        ],
    },
    {
        "id": "D019",
        "filename": "2024-03-21_balkan_motor_claims_trend_report.md",
        "type": "claims_review",
        "client": "Balkan Motor Pool",
        "project": "Motor Pool Bordereaux Quality Review",
        "date": "2024-03-21",
        "title": "Motor Pool Claims Trend Report",
        "length": "medium",
        "has_table": True,
        "supports_questions": ["q008"],
        "must_include": [
            "Claims trend analysis depends on corrected bordereaux fields.",
            "Tim Zupan validated the corrected claim status mapping.",
            "Include a claims frequency table.",
        ],
        "must_not_include": [
            "Do not state that bordereaux quality was perfect.",
        ],
    },
    # --- Ljubljana Specialty Underwriters: Facultative Property Placement ---
    # (these specs list no supported evaluation questions)
    {
        "id": "D020",
        "filename": "2024-05-14_ljubljana_specialty_fac_property_meeting.md",
        "type": "meeting_minutes",
        "client": "Ljubljana Specialty Underwriters",
        "project": "Facultative Property Placement",
        "date": "2024-05-14",
        "title": "Facultative Property Placement Meeting Notes",
        "length": "short",
        "has_table": False,
        "supports_questions": [],
        "must_include": [
            "Discussion of a single industrial property placement.",
            "Risk improvements required before binding.",
        ],
        "must_not_include": [
            "Do not mention a cyber catastrophe treaty.",
            "Do not mention Nova Kredit.",
        ],
    },
    {
        "id": "D021",
        "filename": "2024-05-17_ljubljana_specialty_fac_quote_comparison.csv",
        "type": "structured_data",
        "client": "Ljubljana Specialty Underwriters",
        "project": "Facultative Property Placement",
        "date": "2024-05-17",
        "title": "Facultative Quote Comparison",
        "length": "short",
        "has_table": True,
        "supports_questions": [],
        "must_include": [
            "CSV quote comparison with reinsurer, share, rate, deductible, and subjectivities.",
            "At least three reinsurer quote rows.",
        ],
        "must_not_include": [
            "Do not include unrelated treaty portfolio data.",
        ],
    },
    {
        "id": "D022",
        "filename": "2024-05-24_ljubljana_specialty_fac_bind_memo.md",
        "type": "proposal",
        "client": "Ljubljana Specialty Underwriters",
        "project": "Facultative Property Placement",
        "date": "2024-05-24",
        "title": "Facultative Property Bind Memo",
        "length": "medium",
        "has_table": True,
        "supports_questions": [],
        "must_include": [
            "Final facultative property placement terms.",
            "Risk improvement subjectivities were satisfied.",
            "Include a bound shares table.",
        ],
        "must_not_include": [
            "Do not mention Triglav Adriatic's property catastrophe treaty.",
        ],
    },
    # --- Internal: team capability and resourcing notes ---
    {
        "id": "D023",
        "filename": "2024-01-10_team_capability_matrix.md",
        "type": "team_note",
        "client": "Internal",
        "project": "Team Capability Register",
        "date": "2024-01-10",
        "title": "Reinsurance Analytics Team Capability Matrix",
        "length": "medium",
        "has_table": True,
        "supports_questions": ["q003", "q008", "q012"],
        "must_include": [
            "Sara Kovac has prior catastrophe modelling experience.",
            "Tim Zupan has bordereaux data quality and claims analytics experience.",
            "Maja Novak has Solvency II capital modelling experience.",
            "Include a team capability table.",
        ],
        "must_not_include": [
            "Do not say every team member has every skill.",
        ],
    },
    {
        "id": "D024",
        "filename": "2024-04-30_team_resourcing_update.md",
        "type": "team_note",
        "client": "Internal",
        "project": "Team Resourcing",
        "date": "2024-04-30",
        "title": "April Team Resourcing Update",
        "length": "short",
        "has_table": False,
        "supports_questions": ["q015"],
        "must_include": [
            "Sara Kovac is expected to remain available for Triglav Adriatic through May.",
            "The note is an April planning assumption.",
        ],
        "must_not_include": [
            "Do not mention the later May reassignment.",
        ],
    },
    # --- Sava Danube Re: early, non-binding market memo ---
    {
        "id": "D025",
        "filename": "2024-02-12_sava_danube_market_pricing_outdated_memo.md",
        "type": "market_memo",
        "client": "Sava Danube Re",
        "project": "Retrocession and Solvency II Capital Review",
        "date": "2024-02-12",
        "title": "Early Retrocession Market Pricing Memo",
        "length": "short",
        "has_table": False,
        "supports_questions": ["q013"],
        "must_include": [
            "Early market estimate considered a EUR 25m event limit.",
            "The memo says the limit is subject to capital modelling.",
            "Mark the terms as non-binding early guidance.",
        ],
        "must_not_include": [
            "Do not say EUR 25m was finally selected.",
        ],
    },
    # --- Internal: lessons learned, May staffing notes, cross-client summaries ---
    {
        "id": "D026",
        "filename": "2024-06-28_reinsurance_lessons_learned_register.md",
        "type": "project_report",
        "client": "Internal",
        "project": "Lessons Learned Register",
        "date": "2024-06-28",
        "title": "Reinsurance Projects Lessons Learned Register",
        "length": "medium",
        "has_table": True,
        "supports_questions": ["q012"],
        "must_include": [
            "Sava Danube Re is listed as a Solvency II capital modelling project.",
            "Main lesson: align retrocession assumptions with the internal model before market submission.",
            "Include a lessons learned table with client, project, topic, and lesson.",
        ],
        "must_not_include": [
            "Do not list Nova Kredit.",
        ],
    },
    {
        "id": "D027",
        "filename": "2024-05-02_cat_modelling_availability_note.md",
        "type": "team_note",
        "client": "Internal",
        "project": "Team Resourcing",
        "date": "2024-05-02",
        "title": "Catastrophe Modelling Availability Note",
        "length": "short",
        "has_table": False,
        "supports_questions": ["q015"],
        "must_include": [
            "Sara Kovac was initially marked available for Triglav Adriatic after May 2.",
            "This is an early May availability note.",
        ],
        "must_not_include": [
            "Do not mention the May 16 reassignment.",
        ],
    },
    {
        "id": "D028",
        "filename": "2024-05-16_staffing_reassignment_update.md",
        "type": "team_note",
        "client": "Internal",
        "project": "Team Resourcing",
        "date": "2024-05-16",
        "title": "Staffing Reassignment Update",
        "length": "short",
        "has_table": False,
        "supports_questions": ["q015"],
        "must_include": [
            "Sara Kovac was reassigned away from Triglav Adriatic after the May modelling handover.",
            "Maja Novak became the escalation contact for catastrophe modelling questions.",
            "This update supersedes earlier May availability notes.",
        ],
        "must_not_include": [
            "Do not say Sara remained fully available after May 16.",
        ],
    },
    {
        "id": "D029",
        "filename": "2024-07-01_cross_client_treaty_summary.json",
        "type": "structured_data",
        "client": "Internal",
        "project": "Cross-Client Treaty Summary",
        "date": "2024-07-01",
        "title": "Cross-Client Treaty Summary",
        "length": "short",
        "has_table": True,
        "supports_questions": ["q001", "q006", "q010", "q013"],
        "must_include": [
            "Valid JSON only.",
            "Entries for Triglav Adriatic, Sava Danube Re, Merkur Mutual, and Adriatic Health.",
            "Triglav final attachment EUR 7.5m.",
            "Sava Danube selected event limit EUR 40m.",
            "Merkur final ceding commission 31.5%.",
            "Adriatic Health bound deductible EUR 1.2m.",
        ],
        "must_not_include": [
            "Do not include Nova Kredit.",
        ],
    },
    {
        "id": "D030",
        "filename": "2024-07-05_annual_reinsurance_knowledge_digest.md",
        "type": "project_report",
        "client": "Internal",
        "project": "Annual Knowledge Digest",
        "date": "2024-07-05",
        "title": "Annual Reinsurance Knowledge Digest",
        "length": "long",
        "has_table": True,
        "supports_questions": ["q008", "q012"],
        "must_include": [
            "Summarize cross-client lessons from the first half of 2024.",
            "Mention Sava Danube Re's Solvency II capital modelling lesson: align retrocession assumptions with the internal model before market submission.",
            "Mention Balkan Motor Pool's bordereaux quality lesson.",
            "Include multiple sections and at least two Markdown tables.",
            "Make it long enough to credibly represent more than 5 pages.",
        ],
        "must_not_include": [
            "Do not introduce new final terms that conflict with source project documents.",
        ],
    },
]
def main() -> None:
    """Run the corpus generator from the command line.

    Parses the CLI flags and selects the matching specs. With
    ``--show-prompt`` it prints one prompt and stops; with ``--dry-run`` it
    prints the plan and stops. Otherwise it loads ``.env`` from the project
    root, requires ``OPENROUTER_API_KEY``, and generates every selected
    document, skipping files that already exist unless ``--overwrite`` is set.

    Raises:
        SystemExit: If ``OPENROUTER_API_KEY`` is not set when generating.
    """
    cli = parse_args()
    chosen_specs = select_specs(cli.only, cli.limit)

    # Inspection modes never touch the API or the filesystem.
    if cli.show_prompt:
        print_prompt(cli.show_prompt)
        return
    if cli.dry_run:
        print_plan(chosen_specs)
        return

    load_env_file(PROJECT_ROOT / ".env")
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise SystemExit("Missing OPENROUTER_API_KEY. Use --dry-run to inspect prompts.")

    client = create_openrouter_client(api_key)
    # Precedence: --model flag, then GENERATION_MODEL, then the built-in default.
    model = cli.model or os.getenv("GENERATION_MODEL") or DEFAULT_MODEL

    for spec in chosen_specs:
        target = output_path_for(spec)
        if target.exists() and not cli.overwrite:
            print(f"Skipping existing file: {target.relative_to(PROJECT_ROOT)}")
        else:
            document = generate_document(client, model, build_prompt(spec))
            write_document(target, document)
            print(f"Wrote {target.relative_to(PROJECT_ROOT)}")
def parse_args() -> argparse.Namespace:
    """Parse the command-line flags of the generator.

    Returns:
        Namespace with ``dry_run``, ``overwrite``, ``limit``, ``only``,
        ``show_prompt`` and ``model`` attributes.
    """
    parser = argparse.ArgumentParser(description="Generate synthetic reinsurance corpus.")

    # Boolean switches share the same action, so register them from a table.
    switches = (
        ("--dry-run", "Print document plan only."),
        ("--overwrite", "Overwrite existing files."),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)

    parser.add_argument("--limit", type=int, help="Generate only the first N selected docs.")
    parser.add_argument("--only", nargs="*", help="Generate only selected document IDs.")

    # Plain string options.
    text_options = (
        ("--show-prompt", "Print the prompt for one document ID."),
        ("--model", "OpenRouter model name."),
    )
    for flag, description in text_options:
        parser.add_argument(flag, help=description)

    parsed = parser.parse_args()
    return parsed
def select_specs(only: list[str] | None, limit: int | None) -> list[dict]:
    """Return the document specs chosen by ``--only`` and ``--limit``.

    Args:
        only: Document IDs to keep. ``None`` or an empty list keeps every spec.
        limit: Maximum number of specs to return after filtering. ``None``
            means no limit.

    Returns:
        A new list of specs in ``DOCUMENT_SPECS`` order.

    Raises:
        SystemExit: If ``limit`` is negative or ``only`` names an ID that is
            not defined in ``DOCUMENT_SPECS``.
    """
    # A negative limit would slice from the end and silently drop documents.
    if limit is not None and limit < 0:
        raise SystemExit(f"--limit must be zero or greater, got {limit}.")

    # Copy so callers never mutate the module-level list by accident.
    specs = list(DOCUMENT_SPECS)
    if only:
        requested_ids = set(only)
        known_ids = {spec["id"] for spec in DOCUMENT_SPECS}
        unknown_ids = sorted(requested_ids - known_ids)
        if unknown_ids:
            # Same wording as --show-prompt, so a typo is reported instead of
            # silently generating nothing.
            raise SystemExit(f"Unknown document ID: {', '.join(unknown_ids)}")
        specs = [spec for spec in specs if spec["id"] in requested_ids]
    if limit is not None:
        specs = specs[:limit]
    return specs
def print_plan(specs: list[dict]) -> None:
    """Print a dry-run summary of each spec.

    For every spec this prints the ID with its project-relative output path,
    a line with type, client, length and table flag, the supported question
    IDs (or ``none``), and a blank separator line.
    """
    for spec in specs:
        relative_path = output_path_for(spec).relative_to(PROJECT_ROOT)
        supported = ", ".join(spec["supports_questions"]) or "none"
        if spec["has_table"]:
            table_note = "table"
        else:
            table_note = "no table"
        headline = f"{spec['id']} -> {relative_path}"
        details = f" {spec['type']} | {spec['client']} | {spec['length']} | {table_note}"
        # The trailing empty item produces the blank line between entries.
        print(headline, details, f" supports: {supported}", "", sep="\n")
def print_prompt(document_id: str) -> None:
    """Print the generation prompt for one document ID.

    Raises:
        SystemExit: If no spec with ``document_id`` is found.
    """
    matched = find_spec(document_id)
    if matched:
        print(build_prompt(matched))
        return
    raise SystemExit(f"Unknown document ID: {document_id}")
def find_spec(document_id: str) -> dict | None:
    """Return the first spec whose ``id`` equals ``document_id``, else ``None``."""
    matches = (entry for entry in DOCUMENT_SPECS if entry["id"] == document_id)
    return next(matches, None)
def create_openrouter_client(api_key: str):
    """Build an ``OpenAI`` client for the configured endpoint.

    The ``openai`` package is imported lazily so that the inspection modes
    work without it. The base URL comes from ``OPENROUTER_BASE_URL`` when
    set, otherwise ``DEFAULT_BASE_URL``.

    Raises:
        SystemExit: If the ``openai`` package cannot be imported.
    """
    try:
        from openai import OpenAI
    except ImportError as exc:
        raise SystemExit("Install the openai package before generating documents.") from exc

    endpoint = os.environ.get("OPENROUTER_BASE_URL", DEFAULT_BASE_URL)
    client = OpenAI(api_key=api_key, base_url=endpoint)
    return client
def load_env_file(path: Path) -> None:
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    An optional leading ``export `` on the key is dropped, and one pair of
    matching surrounding quotes is removed from the value. Variables that
    already exist in the environment are never overwritten.

    Args:
        path: Location of the env file. Nothing happens if it is not a
            regular file.
    """
    # is_file() also rejects a directory named ".env", which would otherwise
    # crash in read_text().
    if not path.is_file():
        return

    # utf-8-sig drops a leading BOM so the first key is not corrupted.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue

        key, _, value = line.partition("=")
        key = key.strip()
        # Accept shell-style "export KEY=value" lines.
        if key.startswith("export "):
            key = key[len("export "):].strip()

        value = value.strip()
        # Remove exactly one pair of matching quotes; unbalanced or inner
        # quotes are part of the value and must survive.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1]

        if key:
            # Real environment variables take precedence over the file.
            os.environ.setdefault(key, value)
def generate_document(client, model: str, prompt: str) -> str:
    """Request one document from the chat-completions API.

    Args:
        client: Object exposing ``chat.completions.create`` (the client
            returned by ``create_openrouter_client``).
        model: Model name passed to the API.
        prompt: User prompt describing the document to generate.

    Returns:
        The generated text with surrounding whitespace removed and exactly
        one trailing newline.

    Raises:
        RuntimeError: If the response has no choices or the returned text is
            empty or whitespace-only.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": (
                    "You generate realistic synthetic business documents for a RAG "
                    "evaluation corpus. Follow the requested format exactly. Never include "
                    "Markdown code fences around the final document."
                ),
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.4,
    )
    # A response without choices would otherwise surface as an opaque
    # IndexError/TypeError.
    if not response.choices:
        raise RuntimeError("OpenRouter returned an empty document.")
    # Strip before checking so a whitespace-only reply is rejected instead of
    # being written out as a file that contains only a newline.
    content = (response.choices[0].message.content or "").strip()
    if not content:
        raise RuntimeError("OpenRouter returned an empty document.")
    return content + "\n"
def write_document(output_path: Path, content: str) -> None:
    """Write ``content`` to ``output_path`` as UTF-8, creating parent folders."""
    parent_dir = output_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as handle:
        handle.write(content)
def output_path_for(spec: dict) -> Path:
    """Return the corpus path for a spec: type folder plus file name."""
    subfolder = DOCUMENT_TYPE_DIRS[spec["type"]]
    return CORPUS_ROOT.joinpath(subfolder, spec["filename"])
def build_prompt(spec: dict) -> str:
    """Assemble the full generation prompt for one document spec.

    The prompt lists the document identity fields, the required and forbidden
    facts as bullet lists, the fixed writing requirements, and the output
    format instructions chosen from the file name.
    """
    output_format = format_instructions_for(spec)
    required_facts = bullet_list(spec["must_include"])
    forbidden_facts = bullet_list(spec["must_not_include"])
    question_ids = ", ".join(spec["supports_questions"]) or "none directly"

    identity_lines = [
        f"- Document ID: {spec['id']}",
        f"- Filename: {spec['filename']}",
        f"- Title: {spec['title']}",
        f"- Client: {spec['client']}",
        f"- Project: {spec['project']}",
        f"- Date: {spec['date']}",
        f"- Document type: {spec['type']}",
        f"- Length target: {spec['length']}",
        f"- Contains table or structured data: {spec['has_table']}",
        f"- Evaluation questions supported: {question_ids}",
    ]
    writing_requirements = [
        "- Make the content realistic for an internal reinsurance advisory knowledge base.",
        "- Keep all names, clients, reinsurers, figures, and projects fictional.",
        "- Preserve the exact required figures and dates.",
        "- Do not add facts that would answer unanswerable evaluation questions.",
        "- Include enough context for retrieval, but keep the document focused.",
        "- If this is a long document, use multiple sections, tables, and appendices.",
        "- If this is Markdown, include YAML frontmatter with title, client, project, date,",
        "document_type, document_id, and supports_questions.",
        "- If this is Markdown, use clear section headings so citations can point to sections.",
    ]
    prompt_lines = [
        "Create one synthetic reinsurance advisory document.",
        "Document identity:",
        *identity_lines,
        "Facts that must appear:",
        required_facts,
        "Facts that must not appear:",
        forbidden_facts,
        "Writing requirements:",
        *writing_requirements,
        "Output format:",
        output_format,
    ]
    # Strip the ends of the assembled prompt, as the template always has.
    return "\n".join(prompt_lines).strip()
def format_instructions_for(spec: dict) -> str:
    """Return output-format instructions based on the spec's file extension.

    ``.csv`` and ``.json`` file names get format-specific instructions; every
    other file name gets the Markdown instructions.
    """
    name = spec["filename"]
    structured_formats = (
        (
            ".csv",
            "Return valid CSV only. Include a header row. Do not use Markdown fences, "
            "comments, or explanatory prose.",
        ),
        (
            ".json",
            "Return valid JSON only. Use clear keys and arrays where useful. Do not use "
            "Markdown fences or explanatory prose.",
        ),
    )
    for suffix, instruction in structured_formats:
        if name.endswith(suffix):
            return instruction
    markdown_instruction = (
        "Return Markdown only. Start with YAML frontmatter. Do not use Markdown fences "
        "around the document."
    )
    return markdown_instruction
def bullet_list(items: list[str]) -> str:
    """Render ``items`` as ``- item`` lines joined by newlines."""
    return "\n".join(map("- {}".format, items))
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()