#!/usr/bin/env python3
"""Polite, resumable downloader for sites.gold.ac.uk (WordPress) pages.

Uses curl for fetching (more reliable than requests in this environment).
Respects robots Crawl-delay by default.

Output: one "project package" folder per page (later we can refine to only project pages).
"""

import argparse
import csv
import hashlib
import json
import os
import random
import re
import subprocess
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

# Browser-like User-Agent header value; curl_fetch sends it with every request
# and main() records it in each package's capture_meta.json.
DEFAULT_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36"


def slugify(s: str) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated slug.

    After trimming and lowercasing, every run of characters outside
    ``a-z0-9`` acts as a separator between words. Returns ``"untitled"``
    when no alphanumeric characters remain.
    """
    words = [w for w in re.split(r"[^a-z0-9]+", s.strip().lower()) if w]
    if not words:
        return "untitled"
    return "-".join(words)


def ensure_dir(p: Path) -> None:
    """Make sure directory ``p`` exists, creating missing parents as needed.

    Does nothing when the directory is already there.
    """
    os.makedirs(p, exist_ok=True)


def polite_sleep(base: float, jitter: float) -> None:
    """Pause for ``base`` seconds plus a random extra of up to ``jitter`` seconds."""
    extra = random.uniform(0, jitter)
    time.sleep(base + extra)


def curl_fetch(url: str, out: Optional[Path] = None, timeout: int = 40) -> bytes:
    """Download ``url`` with the ``curl`` command-line tool and return the body.

    Args:
        url: Address to fetch; redirects are followed (``-L``).
        out: Optional destination file. When given, the body is also written
            there (parent directories are created first). Nothing is written
            if the transfer fails.
        timeout: Upper bound in seconds for the whole transfer (``--max-time``).

    Returns:
        The raw response body.

    Raises:
        subprocess.CalledProcessError: curl exited non-zero. Because of
            ``--fail`` this includes HTTP responses with status >= 400, so an
            error page is never mistaken for (or saved as) real content.
        OSError: the ``curl`` executable could not be started.
    """
    cmd = [
        "curl",
        "-sS",
        "-L",
        # Treat HTTP 4xx/5xx as failures instead of returning the error page body.
        "--fail",
        "--max-time",
        str(timeout),
        "-H",
        f"User-Agent: {DEFAULT_UA}",
        # Pass the address via --url so a value starting with "-" can never be
        # parsed as a curl option.
        "--url",
        url,
    ]
    data = subprocess.check_output(cmd)
    if out is not None:
        ensure_dir(out.parent)
        out.write_bytes(data)
    return data


def read_robots_crawl_delay(robots_url: str) -> Optional[float]:
    """Return the ``Crawl-delay`` that robots.txt declares for ``User-agent: *``.

    Args:
        robots_url: Full URL of the robots.txt file to download.

    Returns:
        The delay in seconds from the first ``Crawl-delay`` line found in a
        group that applies to ``*``, or ``None`` when the file cannot be
        fetched, has no such line, or the value is not a finite,
        non-negative number.
    """
    try:
        txt = curl_fetch(robots_url, timeout=30).decode("utf-8", errors="ignore")
    except Exception:
        # Best effort: a missing/unreachable robots.txt simply means "no delay known".
        return None

    in_star_group = False
    prev_was_user_agent = False
    # A UTF-8 BOM would otherwise hide the first directive.
    for raw in txt.lstrip("\ufeff").splitlines():
        # Drop trailing "# comment" text so "Crawl-delay: 10 # secs" still parses.
        line = raw.split("#", 1)[0].strip()
        if ":" not in line:
            continue
        field, value = line.split(":", 1)
        field = field.strip().lower()
        value = value.strip()

        if field == "user-agent":
            # Consecutive User-agent lines form ONE group that shares the
            # rules below them; a User-agent line after a rule starts a new group.
            if prev_was_user_agent:
                in_star_group = in_star_group or value == "*"
            else:
                in_star_group = value == "*"
            prev_was_user_agent = True
            continue

        prev_was_user_agent = False
        if in_star_group and field == "crawl-delay":
            try:
                delay = float(value)
            except ValueError:
                return None
            # Reject negative, NaN and infinite values (NaN fails both comparisons).
            if not (0 <= delay < float("inf")):
                return None
            return delay
    return None


def parse_sitemap_urls(xml_bytes: bytes) -> List[str]:
    """Collect the text of every ``<loc>`` element in a sitemap document.

    The XML is parsed with BeautifulSoup's ``"xml"`` parser; elements with
    empty text are skipped and surrounding whitespace is trimmed. Document
    order is preserved.
    """
    document = BeautifulSoup(xml_bytes, "xml")
    return [node.text.strip() for node in document.find_all("loc") if node.text]


def choose_best_src(img_tag) -> Optional[str]:
    """Pick the highest-resolution image URL advertised by an ``<img>`` tag.

    Args:
        img_tag: Any object with a ``.get(name)`` method returning attribute
            values (e.g. a BeautifulSoup tag).

    Returns:
        The ``srcset`` candidate with the largest numeric descriptor
        (``300w``, ``2x``, ``1.5x``; a missing or unrecognised descriptor
        ranks as 0, and the first candidate wins ties). Falls back to the
        ``src`` attribute when ``srcset`` is absent or yields no candidate,
        and returns ``None`` when neither is available.
    """
    best_url = None
    best_rank = -1.0
    for part in (img_tag.get("srcset") or "").split(","):
        bits = part.split()
        if not bits:
            continue
        rank = 0.0
        if len(bits) > 1:
            # Accept fractional density descriptors such as "1.5x" as well as
            # integer widths/densities.
            m = re.match(r"(\d+(?:\.\d+)?)(w|x)", bits[1])
            if m:
                rank = float(m.group(1))
        # Strict ">" keeps the earliest candidate among equals.
        if rank > best_rank:
            best_rank, best_url = rank, bits[0]
    if best_url is not None:
        return best_url
    # A blank or empty srcset must not hide a usable src attribute.
    return img_tag.get("src")


@dataclass
class Extracted:
    """Content pulled out of a single HTML page by ``extract_page``."""

    # Text of the first <h1>, else of the <title> element, else "".
    title: str
    # Up to 300 text fragments from h1/h2/h3/p/li elements, joined by blank lines.
    text_md: str
    # Absolute image URLs (resolved against the page URL), de-duplicated in page order.
    image_urls: List[str]
    # Absolute URLs of every <a href>, de-duplicated in page order.
    outgoing_links: List[str]


def extract_page(html: bytes, base_url: str) -> Extracted:
    """Parse a page and pull out its title, readable text, images and links.

    The title comes from the first ``<h1>`` with text, otherwise from
    ``<title>``. Text, images and links are gathered from ``<main>`` when
    present, else ``<body>``, else the whole document. Relative URLs are
    resolved against ``base_url``; image and link lists keep first-seen order
    without duplicates.
    """
    doc = BeautifulSoup(html, "html.parser")

    heading = doc.find("h1")
    if heading and heading.get_text(strip=True):
        page_title = heading.get_text(" ", strip=True)
    elif doc.title:
        page_title = doc.title.get_text(" ", strip=True)
    else:
        page_title = ""

    root = doc.find("main") or doc.body or doc

    # Text of every heading / paragraph / list item, ignoring fragments shorter
    # than two characters.
    raw_fragments = (
        node.get_text(" ", strip=True)
        for node in root.find_all(["h1", "h2", "h3", "p", "li"], recursive=True)
    )
    fragments = [text for text in raw_fragments if len(text) >= 2]
    # Collapse runs of identical neighbours (e.g. an <li> wrapping a single <p>).
    collapsed = [
        text
        for idx, text in enumerate(fragments)
        if idx == 0 or text != fragments[idx - 1]
    ]
    body_text = "\n\n".join(collapsed[:300]).strip()

    image_sources = (choose_best_src(tag) for tag in root.find_all("img"))
    images = [urljoin(base_url, src) for src in image_sources if src]

    hrefs = (anchor.get("href") for anchor in root.find_all("a"))
    links = [urljoin(base_url, href) for href in hrefs if href]

    # dict.fromkeys keeps the first occurrence of each URL, in order.
    return Extracted(
        title=page_title.strip(),
        text_md=body_text,
        image_urls=list(dict.fromkeys(images)),
        outgoing_links=list(dict.fromkeys(links)),
    )


def sha256_file(p: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``p``.

    The file is read in 1 MiB pieces so large files are never held in memory
    all at once.
    """
    digest = hashlib.sha256()
    piece_size = 1024 * 1024
    with p.open("rb") as fh:
        while True:
            piece = fh.read(piece_size)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()


def copy_template(template_dir: Path, dest_dir: Path) -> None:
    """Copy the template tree to ``dest_dir`` unless that path already exists.

    An existing destination is left completely untouched, so re-running the
    crawler never overwrites a package that was already created.
    """
    import shutil

    if not dest_dir.exists():
        shutil.copytree(template_dir, dest_dir)


def download_images(image_urls: List[str], originals_dir: Path, delay_s: float, jitter_s: float, allow_host: str) -> List[str]:
    """Download same-host images into ``originals_dir`` as ``img_NNN.<ext>``.

    Args:
        image_urls: Absolute image URLs, in page order. The 1-based position
            in this list determines the file number, so numbering is stable
            across runs even when some entries are skipped.
        originals_dir: Directory that receives the files.
        delay_s: Base pause in seconds after each network request.
        jitter_s: Maximum random extra pause in seconds.
        allow_host: Only URLs whose network location equals this host
            (compared case-insensitively) are downloaded.

    Returns:
        Paths (as strings) of the images that are present on disk, either
        downloaded now or already there from an earlier run. Failed downloads
        are skipped silently (best effort).
    """
    allowed = allow_host.lower()
    saved = []
    for i, u in enumerate(image_urls, start=1):
        parsed = urlparse(u)
        # Host names are case-insensitive.
        if parsed.netloc.lower() != allowed:
            continue
        ext = os.path.splitext(parsed.path)[1].lower()
        if ext not in {".jpg", ".jpeg", ".png", ".webp", ".gif"}:
            ext = ".jpg"
        out = originals_dir / f"img_{i:03d}{ext}"
        if out.exists():
            # Already fetched on a previous run; no request, so no pause needed.
            saved.append(str(out))
            continue
        try:
            curl_fetch(u, out=out, timeout=60)
        except Exception:
            # Best effort: one broken image must not abort the page.
            pass
        else:
            saved.append(str(out))
        # Pause after every request, including failed ones, so a run of
        # errors never turns into rapid-fire requests against the server.
        polite_sleep(delay_s, jitter_s)
    return saved


def _warn(message: str) -> None:
    """Print a non-fatal diagnostic to stderr."""
    import sys

    print(f"warning: {message}", file=sys.stderr)


def _load_state(state_path: Path) -> dict:
    """Read the resume state, falling back to a fresh one if missing or unreadable."""
    fresh = {"done": [], "queue": [], "errors": []}
    if not state_path.exists():
        return fresh
    try:
        loaded = json.loads(state_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # Best effort: start over rather than abort, but say so.
        _warn(f"ignoring unreadable state file {state_path}: {exc}")
        return fresh
    if not isinstance(loaded, dict):
        _warn(f"ignoring state file {state_path}: top-level value is not an object")
        return fresh
    return loaded


def _save_state(state_path: Path, state: dict, done: set, queue: List[str]) -> None:
    """Persist progress atomically so an interrupted write cannot corrupt it."""
    state["done"] = sorted(done)
    state["queue"] = list(queue)
    ensure_dir(state_path.parent)
    tmp_path = state_path.with_name(state_path.name + ".tmp")
    tmp_path.write_text(json.dumps(state, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    os.replace(tmp_path, state_path)


def _seed_queue(sitemap_url: str, start: str, start_url: str) -> List[str]:
    """Build the initial crawl queue from the sitemap, with the start page first."""
    try:
        xml = curl_fetch(sitemap_url)
    except (subprocess.CalledProcessError, OSError) as exc:
        _warn(f"could not fetch sitemap {sitemap_url}: {exc}; crawling the start URL only")
        urls = []
    else:
        urls = parse_sitemap_urls(xml)

    start_path = urlparse(start).path.rstrip("/")
    start_key = start.rstrip("/")
    queue = [start_url]
    # dict.fromkeys drops duplicate sitemap entries while keeping their order.
    for u in dict.fromkeys(urls):
        # The start page goes first exactly once, whatever trailing slash the
        # sitemap uses for it.
        if u.rstrip("/") == start_key:
            continue
        path = urlparse(u).path
        # Match on a path-segment boundary so "/show" does not admit "/show-2021".
        if path == start_path or path.startswith(start_path + "/"):
            queue.append(u)
    return queue


def _yaml_dq(value: str) -> str:
    """Make ``value`` safe to place between double quotes in the YAML profile."""
    return value.replace("\\", "\\\\").replace('"', "'")


def _fill_profile(profile: Path, program: str, year: int, title: str, url: str) -> None:
    """Fill the known fields of ``project_profile.yaml`` in place, if the file exists."""
    if not profile.exists():
        return
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    substitutions = [
        (r'program: ".*"', f'program: "{_yaml_dq(program)}"'),
        (r"grad_year: \d+", f"grad_year: {year}"),
        (r'project_title: ".*"', f'project_title: "{_yaml_dq(title)}"'),
        (r'project_slug: ".*"', f'project_slug: "{slugify(title)}"'),
        (r'project_url: ".*"', f'project_url: "{_yaml_dq(url)}"'),
        (r'date_utc: ".*"', f'date_utc: "{today}"'),
    ]
    text = profile.read_text(encoding="utf-8")
    for pattern, replacement in substitutions:
        # A callable replacement is inserted verbatim; a plain string would have
        # its backslashes interpreted as regex escapes / group references.
        text = re.sub(pattern, lambda _m, r=replacement: r, text)
    profile.write_text(text, encoding="utf-8")


def _resolve_project_dir(projects_root: Path, project_slug: str, year: int, url: str) -> Path:
    """Return the package folder for ``url``, avoiding clashes between same-titled pages."""
    candidate = projects_root / f"{project_slug}__unknown__goldsmiths__{year}"
    marker = candidate / "00_admin" / "source_url.txt"
    if not marker.exists():
        return candidate
    try:
        owner = marker.read_text(encoding="utf-8").strip()
    except (OSError, ValueError):
        owner = ""
    if owner == url:
        # Same page as before (resumed run): reuse its folder.
        return candidate
    # A different page already owns this folder; add a short URL hash so the
    # two pages do not overwrite each other's files.
    digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:8]
    return projects_root / f"{project_slug}-{digest}__unknown__goldsmiths__{year}"


def _write_package(url: str, html: bytes, args: argparse.Namespace, base_dir: Path, template_dir: Path, host: str) -> tuple:
    """Create/refresh the package folder for one page; return ``(title, saved_image_paths)``."""
    extracted = extract_page(html, url)
    title = extracted.title or url
    project_slug = slugify(title)[:80]

    project_dir = _resolve_project_dir(base_dir / "projects", project_slug, args.year, url)
    copy_template(template_dir, project_dir)

    admin_dir = project_dir / "00_admin"
    text_dir = project_dir / "01_text"
    raw_dir = project_dir / "00_source_page"
    originals_dir = project_dir / "02_images" / "originals"
    # Do not rely on the template providing every sub-folder.
    for folder in (admin_dir, text_dir, raw_dir, originals_dir):
        ensure_dir(folder)

    (raw_dir / "page.html").write_bytes(html)

    # minimal fill
    (admin_dir / "source_url.txt").write_text(url + "\n", encoding="utf-8")
    meta = {
        "captured_at_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "captured_by": "openclaw",
        "user_agent": DEFAULT_UA,
        "notes": "polite-crawl: robots crawl-delay respected",
    }
    (admin_dir / "capture_meta.json").write_text(json.dumps(meta, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")

    # project profile yaml quick fill
    _fill_profile(text_dir / "project_profile.yaml", args.program, args.year, title, url)

    # description
    if extracted.text_md:
        (text_dir / "project_description.md").write_text("# Project Description\n\n" + extracted.text_md + "\n", encoding="utf-8")

    # links: anything pointing off the crawled host
    non_host = [link for link in extracted.outgoing_links if urlparse(link).netloc != host]
    lines = ["# Links", "", "## Official page", f"- {url}", "", "## External (video/press/other)"]
    lines.extend(f"- {link}" for link in non_host[:100])
    (text_dir / "links.md").write_text("\n".join(lines) + "\n", encoding="utf-8")

    saved_imgs = download_images(extracted.image_urls[:60], originals_dir, args.delay, args.jitter, allow_host=host)

    # checksums of everything in originals/, in `sha256sum`-style "<hash>  <name>" lines
    checksum_lines = [f"{sha256_file(p)}  {p.name}" for p in sorted(originals_dir.glob("img_*"))]
    (admin_dir / "checksums.sha256").write_text("\n".join(checksum_lines) + ("\n" if checksum_lines else ""), encoding="utf-8")

    return title, saved_imgs


def main():
    """Command-line entry point: crawl sitemap pages under the start URL.

    Reads the command-line options, raises the request delay to the site's
    robots.txt ``Crawl-delay`` when that is larger, then processes up to
    ``--max-pages`` queued URLs. For each page it writes one package folder
    (raw HTML, metadata, extracted text, links, same-host images, checksums),
    appends a row to ``projects_master.csv`` and saves the resume state.
    Pages that fail to download are recorded under ``errors`` in the state
    file and skipped.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--start-url", required=True)
    ap.add_argument("--sitemap", required=True)
    ap.add_argument("--program", required=True)
    ap.add_argument("--year", type=int, required=True)
    ap.add_argument("--out-root", default=str(Path.home() / "Desktop" / "Goldsmiths_MediaLibrary"))
    ap.add_argument("--template", default=str(Path.home() / "Desktop" / "RCA_MediaLibrary" / "_TEMPLATE" / "project_package"))
    ap.add_argument("--state", default=str(Path.home() / "Desktop" / "Goldsmiths_MediaLibrary" / "_state.json"))
    ap.add_argument("--delay", type=float, default=30.0)
    ap.add_argument("--jitter", type=float, default=8.0)
    ap.add_argument("--max-pages", type=int, default=300)
    args = ap.parse_args()

    start = args.start_url.rstrip("/") + "/"
    host = urlparse(start).netloc

    # robots.txt lives at the site root, so resolve it against "/" rather than
    # against the (possibly nested) start path.
    robots_url = urljoin(start, "/robots.txt")
    rd = read_robots_crawl_delay(robots_url)
    if rd:
        args.delay = max(args.delay, rd)

    out_root = Path(args.out_root)
    template_dir = Path(args.template)
    state_path = Path(args.state)

    ensure_dir(out_root)

    state = _load_state(state_path)
    done = set(state.get("done", []))
    queue = list(state.get("queue", []))

    if not queue:
        queue = _seed_queue(args.sitemap, start, args.start_url)

    program_slug = slugify(args.program)
    base_dir = out_root / program_slug / str(args.year)
    ensure_dir(base_dir / "00_index")
    ensure_dir(base_dir / "artists")
    ensure_dir(base_dir / "projects")

    master_csv = base_dir / "00_index" / "projects_master.csv"
    if not master_csv.exists():
        master_csv.write_text("program,grad_year,project_title,project_url,cover_image_path,image_count,status,notes\n", encoding="utf-8")

    processed = 0
    while queue and processed < args.max_pages:
        url = queue.pop(0)
        if url in done:
            continue

        try:
            html = curl_fetch(url)
        except Exception as e:
            state.setdefault("errors", []).append({"url": url, "err": str(e)})
            # Save right away so the error record survives even if no later
            # page succeeds in this run.
            _save_state(state_path, state, done, queue)
            # Back off a little longer after a failure.
            polite_sleep(args.delay * 1.5, args.jitter)
            continue

        title, saved_imgs = _write_package(url, html, args, base_dir, template_dir, host)

        cover = saved_imgs[0] if saved_imgs else ""
        with master_csv.open("a", encoding="utf-8", newline="") as f:
            w = csv.writer(f)
            w.writerow([args.program, args.year, title, url, cover, len(saved_imgs), "downloaded", ""])

        done.add(url)
        processed += 1
        _save_state(state_path, state, done, queue)

        polite_sleep(args.delay, args.jitter)

    # Also record queue entries that were dropped because they were already done.
    _save_state(state_path, state, done, queue)


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
