import json
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Wikipedia article downloaded by main(); also stored as "source_url" on
# every entry produced by extract_sections().
WIKI_URL = "https://en.wikipedia.org/wiki/Tourism_in_Cambodia"
# JSONL file that main() writes the extracted sections to.
OUTPUT_FILE = Path("data/wiki_tourism_cambodia.jsonl")


def normalize_text(text: str) -> str:
    """Collapse every whitespace run in *text* to one space and trim the ends."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()


def _is_heading_tag(node) -> bool:
    """Return True when *node* is an ``h<digits>`` element such as ``h2``."""
    name = node.name
    return bool(name) and name.startswith("h") and name[1:].isdigit()


def _find_heading(node):
    """Return the heading element that *node* represents, or ``None``.

    Accepts both a bare ``<h2>`` and the ``<div class="mw-heading">`` wrapper
    that current MediaWiki versions place around each heading.
    """
    if _is_heading_tag(node):
        return node
    if node.name == "div" and "mw-heading" in (node.get("class") or []):
        return node.find(_is_heading_tag)
    return None


def _heading_title(heading) -> str:
    """Return the visible title of *heading* without the "[edit]" link text."""
    # Legacy markup nests the edit link inside the heading element itself.
    for edit_link in heading.find_all(class_="mw-editsection"):
        edit_link.decompose()
    # normalize_text keeps the spaces around inline markup, which
    # get_text(strip=True) would glue together.
    return normalize_text(heading.get_text())


def extract_sections(html: str) -> list[dict]:
    """Split a Wikipedia article page into one entry per section.

    Paragraphs and lists that are direct children of the article body are
    grouped under the most recent heading; text before the first heading is
    filed under "Introduction". Sections without any text are dropped.

    Args:
        html: Full HTML source of the article page.

    Returns:
        A list of dicts with the keys ``title`` (page title), ``source_url``
        (``WIKI_URL``), ``section_title`` and ``content`` (the section's
        paragraphs/lists joined by blank lines).

    Raises:
        RuntimeError: If the content area or the page heading is missing.
    """
    soup = BeautifulSoup(html, "html.parser")
    content = soup.find(id="mw-content-text")
    if content is None:
        raise RuntimeError("Could not find Wikipedia content area")

    title_node = soup.find(id="firstHeading")
    if title_node is None:
        raise RuntimeError("Could not find Wikipedia page heading")
    page_title = normalize_text(title_node.get_text())

    # MediaWiki wraps the rendered article in a "mw-parser-output" div, so the
    # paragraphs and headings are that div's children, not the content area's.
    body = content.find(class_="mw-parser-output")
    if body is None:
        body = content

    sections = []
    section_title = "Introduction"
    chunks = []

    def flush() -> None:
        # Sections that collected no text are skipped.
        if chunks:
            sections.append({
                "section_title": section_title,
                "content": "\n\n".join(chunks),
            })

    for node in body.find_all(recursive=False):
        heading = _find_heading(node)
        if heading is not None:
            flush()
            section_title = _heading_title(heading)
            chunks = []
        elif node.name == "p":
            text = normalize_text(node.get_text())
            if text:
                chunks.append(text)
        elif node.name in {"ul", "ol"}:
            # Only direct <li> children: a parent item's text already contains
            # its nested items, so recursing would emit them twice.
            items = [
                normalize_text(li.get_text())
                for li in node.find_all("li", recursive=False)
            ]
            items = [item for item in items if item]
            if items:
                chunks.append("; ".join(items))

    flush()

    return [
        {"title": page_title, "source_url": WIKI_URL, **section}
        for section in sections
    ]


def save_jsonl(entries: list[dict], output_path: Path) -> None:
    """Write *entries* to *output_path* as UTF-8 JSON Lines.

    Missing parent directories are created and an existing file is
    overwritten. Each entry becomes one line; non-ASCII characters are
    written as-is rather than escaped.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in entries
        )


def main() -> None:
    """Download the article, extract its sections and save them as JSONL.

    Fetches ``WIKI_URL``, parses the response with ``extract_sections`` and
    writes the result to ``OUTPUT_FILE`` with ``save_jsonl``, printing progress
    along the way.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Wikimedia's User-Agent policy asks clients to identify themselves and
    # may block generic library defaults such as python-requests.
    # TODO: add contact details (URL or e-mail) as the policy requests.
    headers = {
        "User-Agent": "wiki-tourism-cambodia-scraper/1.0 (JSONL dataset builder)",
    }

    print(f"Fetching {WIKI_URL}")
    response = requests.get(WIKI_URL, headers=headers, timeout=30)
    response.raise_for_status()

    print("Extracting sections...")
    sections = extract_sections(response.text)

    print(f"Saving {len(sections)} sections to {OUTPUT_FILE}")
    save_jsonl(sections, OUTPUT_FILE)
    print("Done. You can use this file for training or prompt data.")


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
