#!/usr/bin/env python3
"""Polite, resumable downloader for 2025.rca.ac.uk program pages and linked project pages.

- Obeys robots.txt allow/deny rules and waits 30s by default (chosen to match the site's robots Crawl-delay) plus random jitter between requests.
- Uses WP sitemaps to find URLs, then filters by keyword.
- Downloads HTML + images + extracts basic text into template.

This is intentionally conservative to avoid triggering anti-bot.
"""

import argparse
import csv
import hashlib
import json
import os
import random
import re
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, List, Optional, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from urllib import robotparser

# Browser-style User-Agent string; main() sends it on every request and
# write_project_files() records it in the capture metadata.
DEFAULT_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36"

# WordPress sitemap documents that main() fetches to seed the crawl queue
# when no queue has been restored from the state file.
SITEMAPS = [
    "https://2025.rca.ac.uk/wp-sitemap-posts-post-1.xml",
    "https://2025.rca.ac.uk/wp-sitemap-posts-page-1.xml",
    "https://2025.rca.ac.uk/wp-sitemap-posts-rca-profiles-1.xml",
]


def slugify(s: str) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated slug.

    Runs of characters outside ``a-z0-9`` become a single hyphen and hyphens
    at either end are removed. Returns ``"untitled"`` when nothing is left.
    """
    hyphenated = re.sub(r"[^a-z0-9]+", "-", s.strip().lower())
    slug = re.sub(r"-+", "-", hyphenated).strip("-")
    if slug:
        return slug
    return "untitled"


def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``p``.

    The file is read in 1 MiB blocks so large files are never held in memory
    in full.
    """
    digest = hashlib.sha256()
    block_size = 1024 * 1024
    with p.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()


def ensure_dir(p: Path) -> None:
    """Create directory ``p`` along with any missing parents.

    Calling it on a directory that already exists is a no-op.
    """
    p.mkdir(exist_ok=True, parents=True)


def polite_sleep(base: float, jitter: float) -> None:
    """Pause for ``base`` seconds plus a random extra between 0 and ``jitter`` seconds."""
    extra = random.uniform(0, jitter)
    time.sleep(base + extra)


def load_robot_parser(robots_url: str, session: requests.Session) -> robotparser.RobotFileParser:
    """Download robots.txt through ``session`` and return a parser loaded with its rules.

    Going through the session means the request carries the session's
    headers (including its User-Agent). If fetching or parsing fails for any
    reason, the returned parser holds a permissive "allow everything" rule
    set instead.
    """
    parser = robotparser.RobotFileParser()
    try:
        response = session.get(robots_url, timeout=30)
        response.raise_for_status()
        rule_lines = response.text.splitlines()
        parser.parse(rule_lines)
    except Exception:
        # robots.txt could not be obtained: fall back to allow-all
        # (request pacing is still applied by the callers).
        parser.parse(["User-agent: *", "Disallow:"])
    return parser


def fetch(session: requests.Session, url: str, out: Optional[Path] = None, timeout: int = 30) -> bytes:
    """GET ``url`` and return the response body as bytes.

    When ``out`` is given, the body is also written to that path, creating
    its parent directories first. An HTTP error status raises through
    ``raise_for_status``.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    body = response.content
    if out is None:
        return body
    ensure_dir(out.parent)
    out.write_bytes(body)
    return body


def parse_sitemap_urls(xml_bytes: bytes) -> List[str]:
    """Return the whitespace-stripped text of every ``<loc>`` element in a sitemap document.

    Elements whose text is empty are skipped.
    """
    document = BeautifulSoup(xml_bytes, "xml")
    return [node.text.strip() for node in document.find_all("loc") if node.text]


def choose_best_src(img_tag) -> Optional[str]:
    """Pick the most detailed image URL advertised by an ``<img>``-like tag.

    Args:
        img_tag: Any object with a mapping-style ``get(name)`` for attributes
            (a BeautifulSoup tag, or a plain dict).

    Returns:
        The ``srcset`` candidate with the largest descriptor value (``640w``,
        ``2x``, ``1.5x`` ...); the first candidate wins a tie, and a candidate
        without a recognised descriptor counts as 0. When ``srcset`` is
        missing or contains no candidates at all, the ``src`` attribute is
        returned (which may be ``None``).
    """
    srcset = img_tag.get("srcset")
    best_url: Optional[str] = None
    best_weight = -1.0
    for candidate in (srcset or "").split(","):
        bits = candidate.split()
        if not bits:
            continue
        weight = 0.0
        if len(bits) > 1:
            # Accept fractional density descriptors ("1.5x") as well as
            # integer widths ("1280w").
            m = re.match(r"(\d*\.?\d+)[wx]", bits[1])
            if m:
                weight = float(m.group(1))
        # Strict ">" keeps the earliest candidate when weights are equal.
        if weight > best_weight:
            best_url, best_weight = bits[0], weight
    if best_url is not None:
        return best_url
    # No usable srcset entry: fall back to the plain src attribute.
    return img_tag.get("src")


@dataclass
class Extracted:
    """Content pulled out of one HTML page by ``extract_page``."""

    # Text of the first <h1>, or of <title> when no <h1> text exists.
    title: str
    # Heading/paragraph/list-item texts joined by blank lines.
    text_md: str
    # Absolute image URLs, de-duplicated in document order.
    image_urls: List[str]
    # Absolute link targets, de-duplicated in document order.
    outgoing_links: List[str]


def extract_page(html: bytes, base_url: str) -> Extracted:
    """Parse one HTML page into its title, text blocks, image URLs and links.

    Relative image and link URLs are resolved against ``base_url``. Text,
    images and links are taken from ``<main>`` when present, otherwise from
    ``<body>``, otherwise from the whole document.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Title: the first <h1> with visible text wins, else the <title> element.
    heading = soup.find("h1")
    has_heading_text = bool(heading and heading.get_text(strip=True))
    title = heading.get_text(" ", strip=True) if has_heading_text else ""
    if not title and soup.title:
        title = soup.title.get_text(" ", strip=True)

    root = soup.find("main") or soup.body or soup

    # Collect heading/paragraph/list-item texts, dropping one-character
    # fragments, the "skip to content" helper link and immediate repeats.
    blocks: List[str] = []
    for node in root.find_all(["h1", "h2", "h3", "p", "li"], recursive=True):
        text = node.get_text(" ", strip=True)
        if len(text) < 2 or text.lower() in {"skip to content"}:
            continue
        if blocks and blocks[-1] == text:
            continue
        blocks.append(text)

    # Only the first 300 blocks are kept.
    text_md = "\n\n".join(blocks[:300])

    # Images: best source per <img>, made absolute. No host filtering is
    # applied here, so off-site images are included as well.
    best_sources = (choose_best_src(img) for img in root.find_all("img"))
    image_urls = [urljoin(base_url, src) for src in best_sources if src]

    # Links: every <a> that carries a non-empty href, made absolute.
    hrefs = (anchor.get("href") for anchor in root.find_all("a"))
    outgoing = [urljoin(base_url, href) for href in hrefs if href]

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return Extracted(
        title=title.strip(),
        text_md=text_md.strip(),
        image_urls=list(dict.fromkeys(image_urls)),
        outgoing_links=list(dict.fromkeys(outgoing)),
    )


def copy_template(template_dir: Path, dest_dir: Path) -> None:
    """Clone the ``template_dir`` tree to ``dest_dir`` unless ``dest_dir`` already exists.

    An existing destination is left exactly as it is: nothing is merged or
    overwritten.
    """
    import shutil

    if not dest_dir.exists():
        shutil.copytree(template_dir, dest_dir)


def write_project_files(project_dir: Path, url: str, extracted: Extracted, program: str, year: int, captured_by: str = "openclaw") -> None:
    """Fill a project folder (already copied from the template) with capture data.

    Writes ``00_admin/source_url.txt`` and ``00_admin/capture_meta.json``,
    rewrites selected fields of ``01_text/project_profile.yaml`` in place,
    writes ``01_text/project_description.md`` when any text was extracted,
    and writes ``01_text/links.md`` listing the page URL plus up to 100
    links that point outside the 2025.rca.ac.uk site.

    Args:
        project_dir: Root of the project package.
        url: Address the page was fetched from.
        extracted: Parsed page content.
        program: Program name stored in the YAML profile.
        year: Graduation year stored in the YAML profile.
        captured_by: Label stored in the capture metadata.

    Raises:
        OSError: If the ``00_admin`` / ``01_text`` directories or the YAML
            profile are missing from ``project_dir``.
    """
    now = datetime.now(timezone.utc)
    admin_dir = project_dir / "00_admin"
    text_dir = project_dir / "01_text"

    # 00_admin: where the capture came from and when
    (admin_dir / "source_url.txt").write_text(url + "\n", encoding="utf-8")
    meta = {
        "captured_at_utc": now.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "captured_by": captured_by,
        "user_agent": DEFAULT_UA,
        "notes": "polite-crawl: robots crawl-delay respected",
    }
    (admin_dir / "capture_meta.json").write_text(json.dumps(meta, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")

    # 01_text: fill selected YAML keys by line-level substitution.
    # Double quotes in the title become single quotes so the quoted YAML
    # value stays closed. Built outside the f-string because nesting quotes
    # and backslashes inside a replacement field only parses on Python 3.12+.
    safe_title = extracted.title.replace('"', "'")
    today = now.strftime("%Y-%m-%d")
    profile_fields = [
        (r'program: ".*"', f'program: "{program}"'),
        (r"grad_year: \d+", f"grad_year: {year}"),
        (r'project_title: ".*"', f'project_title: "{safe_title}"'),
        (r'project_slug: ".*"', f'project_slug: "{slugify(extracted.title)}"'),
        (r'project_url: ".*"', f'project_url: "{url}"'),
        (r'date_utc: ".*"', f'date_utc: "{today}"'),
    ]
    profile_path = text_dir / "project_profile.yaml"
    profile = profile_path.read_text(encoding="utf-8")
    for pattern, line in profile_fields:
        # A callable replacement inserts the text literally; a plain string
        # would have its backslashes interpreted as regex template escapes,
        # which breaks on scraped titles/URLs containing "\".
        profile = re.sub(pattern, lambda _match, line=line: line, profile)
    profile_path.write_text(profile, encoding="utf-8")

    if extracted.text_md:
        (text_dir / "project_description.md").write_text(
            "# Project Description\n\n" + extracted.text_md + "\n", encoding="utf-8"
        )

    # Keep links that leave the site (video/press). The decision is made on
    # the URL's host, so an external link that merely embeds the site
    # address in its path or query string is still listed.
    external = [
        link
        for link in extracted.outgoing_links
        if not (urlparse(link).hostname or "").endswith("2025.rca.ac.uk")
    ]
    lines = ["# Links", "", "## Official project page", f"- {url}", "", "## External (video/press/other)"]
    lines.extend(f"- {link}" for link in external[:100])
    (text_dir / "links.md").write_text("\n".join(lines) + "\n", encoding="utf-8")


def download_images(session: requests.Session, rp: robotparser.RobotFileParser, page_url: str, image_urls: List[str], originals_dir: Path, delay_s: float, jitter_s: float, user_agent: str) -> List[str]:
    """Download images into ``originals_dir`` as ``img_NNN.<ext>``, pausing after each request.

    Args:
        session: HTTP session used for the downloads.
        rp: robots.txt rules; URLs it disallows for ``user_agent`` are skipped.
        page_url: Page the images belong to (accepted for interface
            compatibility; not used here).
        image_urls: Image addresses; the 1-based position of each URL in
            this list determines its ``NNN`` file number.
        originals_dir: Destination directory.
        delay_s: Base pause in seconds after each attempted download.
        jitter_s: Upper bound of the random extra pause.
        user_agent: Agent name checked against the robots rules.

    Returns:
        Paths (as strings) of the images that exist on disk after the call,
        including files already present from an earlier run.
    """
    known_exts = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
    saved: List[str] = []
    for idx, image_url in enumerate(image_urls, start=1):
        parsed = urlparse(image_url)
        # Inline "data:" placeholders and other non-HTTP sources cannot be
        # requested; skip them up front so they do not cost a delay.
        if parsed.scheme not in {"http", "https"}:
            continue
        if not rp.can_fetch(user_agent, image_url):
            continue
        ext = os.path.splitext(parsed.path)[1].lower()
        if ext not in known_exts:
            ext = ".jpg"
        out = originals_dir / f"img_{idx:03d}{ext}"
        if out.exists():
            saved.append(str(out))
            continue
        try:
            fetch(session, image_url, out=out)
            saved.append(str(out))
        except Exception:
            # Best effort: a broken image is skipped. Remove any partial
            # file so a later run does not mistake it for a finished one.
            try:
                out.unlink()
            except OSError:
                pass
        # Pause after every attempted request, failed ones included, so a
        # run of errors never turns into rapid-fire requests.
        polite_sleep(delay_s, jitter_s)
    return saved


def main():
    """Command-line entry point: crawl program pages and archive each one.

    Flow:
      1. Parse CLI options, open an HTTP session and load robots.txt rules.
      2. Restore ``done``/``queue`` from the state file. When the queue is
         empty, seed it with --start-url plus every sitemap URL whose path
         starts with the start URL's path or contains its last path segment.
      3. For each queued URL: fetch, extract, copy the template, write the
         project files, download up to 50 images, record checksums, append a
         row to the master CSV, queue matching same-site links, save state
         and pause.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--start-url", required=True)
    ap.add_argument("--program", required=True)
    ap.add_argument("--year", type=int, required=True)
    ap.add_argument("--out-root", default=str(Path.home() / "Desktop" / "RCA_MediaLibrary"))
    ap.add_argument("--template", default=str(Path.home() / "Desktop" / "RCA_MediaLibrary" / "_TEMPLATE" / "project_package"))
    ap.add_argument("--state", default=str(Path.home() / "Desktop" / "RCA_MediaLibrary" / "_state_2025.json"))
    ap.add_argument("--delay", type=float, default=30.0)  # robots crawl-delay
    ap.add_argument("--jitter", type=float, default=8.0)
    ap.add_argument("--max-pages", type=int, default=999999)
    args = ap.parse_args()

    out_root = Path(args.out_root)
    template_dir = Path(args.template)
    state_path = Path(args.state)
    site_host = "2025.rca.ac.uk"

    ensure_dir(out_root)
    # The state file may live outside out_root; make sure it can be written.
    ensure_dir(state_path.parent)

    user_agent = DEFAULT_UA
    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})

    rp = load_robot_parser("https://2025.rca.ac.uk/robots.txt", session)

    # Load state; an unreadable or malformed file means starting fresh.
    state = {"done": [], "queue": [], "errors": []}
    if state_path.exists():
        try:
            loaded = json.loads(state_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            loaded = None
        if isinstance(loaded, dict):
            state = loaded

    done = set(state.get("done") or [])
    queue = list(state.get("queue") or [])

    # Derived on every run, not only while seeding: the link-expansion step
    # below needs ``keyword`` after resuming from a saved queue too.
    start_path = urlparse(args.start_url).path.rstrip("/")
    keyword = start_path.split("/")[-1]

    if not queue:
        # seed from sitemaps + start-url keyword
        urls = {args.start_url}
        for sm in SITEMAPS:
            if not rp.can_fetch(user_agent, sm):
                continue
            try:
                urls.update(parse_sitemap_urls(fetch(session, sm)))
            except Exception as e:
                state.setdefault("errors", []).append({"url": sm, "err": str(e)})
            # Pause after every sitemap request, whether it worked or not.
            polite_sleep(args.delay, args.jitter)

        # filter: keep urls under start-url path prefix OR containing its slug
        filtered = []
        for u in sorted(urls):
            p = urlparse(u).path
            if p.startswith(start_path) or keyword in p:
                filtered.append(u)
        # Put start-url first
        queue = [args.start_url] + [u for u in filtered if u != args.start_url]

    def save_state() -> None:
        """Write done/queue/errors to the state file via an atomic rename."""
        state["done"] = sorted(done)
        state["queue"] = list(queue)
        tmp_path = state_path.with_name(state_path.name + ".tmp")
        tmp_path.write_text(json.dumps(state, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
        # Rename over the old file so an interrupted write cannot leave a
        # truncated state file behind.
        os.replace(tmp_path, state_path)

    # Prepare output program/year dirs
    program_slug = slugify(args.program)
    base_dir = out_root / program_slug / str(args.year)
    ensure_dir(base_dir / "00_index")
    ensure_dir(base_dir / "artists")
    ensure_dir(base_dir / "projects")

    master_csv = base_dir / "00_index" / "projects_master.csv"
    if not master_csv.exists():
        master_csv.write_text(
            "program,grad_year,artist_name,project_title,project_url,cover_image_path,image_count,video_links,status,notes\n",
            encoding="utf-8",
        )

    processed = 0
    while queue and processed < args.max_pages:
        url = queue.pop(0)
        if url in done:
            continue
        if not rp.can_fetch(user_agent, url):
            done.add(url)
            continue

        try:
            html = fetch(session, url)
        except Exception as e:
            state.setdefault("errors", []).append({"url": url, "err": str(e)})
            # Persist right away so the error record survives an abort.
            save_state()
            # backoff a bit
            polite_sleep(args.delay * 1.5, args.jitter)
            continue

        extracted = extract_page(html, url)

        title = extracted.title or url
        project_slug = slugify(title)[:80]

        project_dir = base_dir / "projects" / f"{project_slug}__unknown__rca__{args.year}"
        copy_template(template_dir, project_dir)

        # Save raw html
        raw_dir = project_dir / "00_source_page"
        ensure_dir(raw_dir)
        (raw_dir / "page.html").write_bytes(html)

        write_project_files(project_dir, url, extracted, args.program, args.year)

        originals_dir = project_dir / "02_images" / "originals"
        ensure_dir(originals_dir)
        saved_imgs = download_images(session, rp, url, extracted.image_urls[:50], originals_dir, args.delay, args.jitter, user_agent)

        # checksums
        checksum_lines = []
        for p in sorted(originals_dir.glob("img_*")):
            checksum_lines.append(f"{sha256_file(p)}  {p.name}")
        (project_dir / "00_admin" / "checksums.sha256").write_text("\n".join(checksum_lines) + ("\n" if checksum_lines else ""), encoding="utf-8")

        cover = saved_imgs[0] if saved_imgs else ""

        # Append to master csv
        with master_csv.open("a", encoding="utf-8", newline="") as f:
            w = csv.writer(f)
            w.writerow([args.program, args.year, "", title, url, cover, len(saved_imgs), "", "downloaded", ""])

        # Mark the page finished before expanding links so a link back to
        # the page itself is not queued again.
        done.add(url)
        processed += 1

        # Add a few internal links to queue (within domain) to expand gently
        for raw_link in extracted.outgoing_links:
            # "#fragment" variants address the same document; drop the
            # fragment so each page is crawled once.
            link = raw_link.split("#", 1)[0]
            target = urlparse(link)
            if target.scheme not in {"http", "https"}:
                continue
            # Decide "same site" on the host, not on a substring of the URL,
            # so an external link embedding the site address is not crawled.
            if not (target.hostname or "").endswith(site_host):
                continue
            # stay in same program vicinity
            if keyword in target.path and link not in done and link not in queue:
                queue.append(link)

        # Persist state each page
        save_state()

        polite_sleep(args.delay, args.jitter)

    # Final save covers URLs that were only marked done (robots-disallowed)
    # and a freshly seeded queue when no page was processed.
    save_state()


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
