#!/usr/bin/env python3
"""Downloader for ualshowcase.arts.ac.uk.

robots.txt:
- Crawl-delay: 10
- Disallow: /projects/

sitemap.xml contains /project/{id}/cover and author pages.
This script queues only /project/* URLs, to match the "work" unit.

Output uses the shared project_package template.
"""

import argparse
import csv
import hashlib
import json
import os
import random
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import List
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent string. main() installs it on the shared requests
# session and also records it in each project's capture_meta.json.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36"


def ensure_dir(p: Path):
    """Make sure directory *p* exists, creating missing parents as needed.

    Does nothing when *p* is already a directory.  If *p* exists but is not a
    directory, the underlying ``mkdir`` call raises ``FileExistsError``.
    """
    if p.is_dir():
        return
    p.mkdir(parents=True, exist_ok=True)


def slugify(s: str) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.

    Every run of characters outside ``a-z0-9`` becomes a single hyphen, and
    leading/trailing hyphens are dropped.  Text that contains no usable
    characters yields ``"untitled"``.
    """
    lowered = s.strip().lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    collapsed = re.sub(r"-+", "-", hyphenated)
    slug = collapsed.strip("-")
    if not slug:
        return "untitled"
    return slug


def polite_sleep(delay: float, jitter: float):
    """Pause for *delay* seconds plus a random extra drawn from ``[0, jitter]``."""
    extra = random.uniform(0, jitter)
    pause = delay + extra
    time.sleep(pause)


def fetch(session: requests.Session, url: str, out: Path | None = None, timeout: int = 40) -> bytes:
    """GET *url* through *session* and return the response body as bytes.

    Args:
        session: Session used to issue the request.
        url: Address to download.
        out: Optional file path; when given, the body is also written there
            (the parent directory is created first).
        timeout: Timeout in seconds passed to ``session.get``.

    Returns:
        The raw response body.

    Raises:
        Whatever ``session.get`` or ``Response.raise_for_status`` raise, e.g.
        for connection failures or an HTTP error status.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    payload = response.content
    if not out:
        return payload
    ensure_dir(out.parent)
    out.write_bytes(payload)
    return payload


def parse_sitemap(xml_bytes: bytes) -> List[str]:
    """Return every non-empty ``<loc>`` value found in a sitemap document.

    Args:
        xml_bytes: Raw sitemap XML.

    Returns:
        The ``<loc>`` texts in document order, with surrounding whitespace
        removed.  Entries that are empty after stripping are skipped, so the
        result never contains ``""``.
    """
    soup = BeautifulSoup(xml_bytes, "xml")
    urls = []
    for loc in soup.find_all("loc"):
        # Test the *stripped* text: a whitespace-only <loc> is not a URL.
        url = loc.text.strip()
        if url:
            urls.append(url)
    return urls


def choose_best_src(img_tag):
    """Return the URL of the largest image candidate offered by an ``<img>`` tag.

    ``srcset`` is preferred: its comma-separated candidates are ranked by
    their width (``640w``) or pixel-density (``2x``, ``1.5x``) descriptor and
    the highest one wins; on a tie the first listed candidate is kept.  A
    candidate without any descriptor counts as ``1x`` (the value the HTML
    specification implies), and one whose descriptor is not recognised ranks
    lowest.  When ``srcset`` is missing or contains no usable candidate, the
    plain ``src`` attribute is returned instead.

    Candidates are split on commas, so URLs that themselves contain a comma
    are not supported.

    Args:
        img_tag: Any object exposing a dict-style ``get(name)`` for its
            attributes (for example a BeautifulSoup ``Tag``).

    Returns:
        The chosen URL exactly as written in the attribute, or ``None`` when
        the tag has neither a usable ``srcset`` entry nor a ``src``.
    """
    best_url = None
    best_rank = -1.0
    for candidate in (img_tag.get("srcset") or "").split(","):
        bits = candidate.split()
        if not bits:
            # Blank piece, e.g. from a trailing comma or a whitespace-only srcset.
            continue
        if len(bits) == 1:
            rank = 1.0
        else:
            # Accept integer and fractional magnitudes ("640w", "2x", "1.5x").
            m = re.match(r"(\d*\.?\d+)[wx]", bits[1])
            rank = float(m.group(1)) if m else 0.0
        # Strict ">" keeps the earliest candidate among equals.
        if rank > best_rank:
            best_url, best_rank = bits[0], rank
    if best_url is not None:
        return best_url
    return img_tag.get("src")


def extract(html: bytes, base_url: str):
    """Pull the title, body text, image URLs and link URLs out of a project page.

    Args:
        html: Raw page markup.
        base_url: URL the page was fetched from; relative image and link
            addresses are resolved against it.

    Returns:
        A 4-tuple ``(title, text, image_urls, link_urls)``:

        * ``title`` - text of the first ``<h1>``, else the ``<title>`` element,
          else ``""``.
        * ``text`` - the first 400 non-trivial heading/paragraph/list-item
          texts, separated by blank lines.
        * ``image_urls`` / ``link_urls`` - absolute URLs in first-seen order
          with duplicates removed.

        Text, images and links are taken from ``<main>`` when the page has
        one, otherwise from ``<body>``, otherwise from the whole document.
    """
    doc = BeautifulSoup(html, "html.parser")

    # Title: the first <h1> wins; <title> is only a fallback.
    title = ""
    heading = doc.find("h1")
    if heading and heading.get_text(strip=True):
        title = heading.get_text(" ", strip=True)
    if not title and doc.title:
        title = doc.title.get_text(" ", strip=True)

    root = doc.find("main") or doc.body or doc

    # Body text: keep fragments longer than one character.
    fragments = [
        el.get_text(" ", strip=True)
        for el in root.find_all(["h1", "h2", "h3", "p", "li"], recursive=True)
    ]
    kept = [frag for frag in fragments if frag and len(frag) > 1]
    text = '\n\n'.join(kept[:400]).strip()

    image_urls = [
        urljoin(base_url, src)
        for src in (choose_best_src(img) for img in root.find_all('img'))
        if src
    ]
    link_urls = [
        urljoin(base_url, href)
        for href in (a.get('href') for a in root.find_all('a'))
        if href
    ]

    # dict.fromkeys drops repeats while preserving first-seen order.
    unique_images = list(dict.fromkeys(image_urls))
    unique_links = list(dict.fromkeys(link_urls))
    return title.strip(), text, unique_images, unique_links


def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *p*.

    The file is read in 1 MiB blocks so large files are never held in memory
    as a whole.
    """
    block_size = 1024 * 1024
    digest = hashlib.sha256()
    with p.open('rb') as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()


def copy_template(template_dir: Path, dest_dir: Path):
    """Copy the directory tree *template_dir* to *dest_dir* unless it already exists.

    An existing *dest_dir* is left completely untouched, so repeated calls
    never overwrite or merge into an earlier copy.
    """
    from shutil import copytree

    if not dest_dir.exists():
        copytree(template_dir, dest_dir)


def download_images(session: requests.Session, imgs: List[str], originals: Path, delay: float, jitter: float, host: str):
    """Download same-host images into *originals* and return the saved paths.

    Files are named ``img_NNN<ext>`` where ``NNN`` is the 1-based position of
    the URL in *imgs* (positions of skipped URLs are not reused).  URLs whose
    host differs from *host* are ignored.  A file that already exists is
    counted as saved without contacting the server.

    Downloading is best-effort: a failing image is skipped rather than
    aborting the project.  Every request that reaches the server, successful
    or not, is followed by a pause, and failures use a longer pause
    (``delay * 1.5``) so errors never cause back-to-back requests.

    Args:
        session: Session used for the downloads.
        imgs: Absolute image URLs, in page order.
        originals: Directory that receives the files.
        delay: Base pause in seconds after each request.
        jitter: Upper bound of the random extra pause in seconds.
        host: Network location images must be served from to be fetched.

    Returns:
        Paths (as strings) of files that exist after the call, in order.
    """
    known_exts = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
    saved = []
    for i, u in enumerate(imgs, start=1):
        parsed = urlparse(u)
        if parsed.netloc != host:
            continue
        ext = os.path.splitext(parsed.path)[1].lower()
        if ext not in known_exts:
            ext = '.jpg'
        out = originals / f"img_{i:03d}{ext}"
        if out.exists():
            # Already fetched on an earlier run; no request, so no pause.
            saved.append(str(out))
            continue
        try:
            fetch(session, u, out=out, timeout=80)
        except Exception:
            # Deliberately broad: one broken image must not stop the crawl.
            # Still back off before the next request.
            polite_sleep(delay * 1.5, jitter)
            continue
        saved.append(str(out))
        polite_sleep(delay, jitter)
    return saved


def _load_state(state_path: Path) -> dict:
    """Read the resume state from disk, or return a fresh one if it is unusable."""
    fresh = {'done': [], 'queue': [], 'errors': []}
    if not state_path.exists():
        return fresh
    try:
        loaded = json.loads(state_path.read_text(encoding='utf-8'))
        if not isinstance(loaded, dict):
            raise ValueError('top-level JSON value is not an object')
    except (OSError, ValueError) as e:
        # Starting over is acceptable, but do it visibly rather than silently.
        import sys
        print(f"warning: ignoring unusable state file {state_path}: {e}", file=sys.stderr)
        return fresh
    return loaded


def _save_state(state_path: Path, state: dict, done: set, queue: list) -> None:
    """Write the resume state via a temporary file so an interrupted write cannot truncate it."""
    state['done'] = sorted(done)
    state['queue'] = queue
    tmp_path = state_path.with_name(state_path.name + '.tmp')
    tmp_path.write_text(json.dumps(state, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    os.replace(tmp_path, state_path)


def _resolve_project_dir(projects_root: Path, slug: str, year: int, url: str) -> Path:
    """Choose the package directory for *url* without clashing with another project of the same slug."""
    proj_dir = projects_root / f"{slug}__unknown__ual__{year}"
    marker = proj_dir / '00_admin' / 'source_url.txt'
    try:
        owner = marker.read_text(encoding='utf-8').strip()
    except (OSError, ValueError):
        owner = ''
    if not owner or owner == url:
        # Unclaimed, or claimed by this very URL (re-run after an interruption).
        return proj_dir
    # Another project already lives here: derive a stable, URL-specific name.
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:8]
    return projects_root / f"{slug}-{digest}__unknown__ual__{year}"


def main():
    """Command-line entry point: archive project pages listed in the site's sitemap.

    Options:
        --year       Graduation year; used in the output path and CSV rows.
        --out-root   Root directory for all output.
        --template   Project package template directory copied per project.
        --state      JSON file holding the resume state (done/queue/errors).
        --delay      Base pause between requests in seconds (default 10).
        --jitter     Maximum random extra pause in seconds (default 4).
        --max-pages  Maximum number of pages to capture in this run.

    For every queued ``/project/`` URL the page HTML, extracted text, external
    links, same-host images and their checksums are stored in a copy of the
    template, and one row is appended to ``00_index/projects_master.csv``.
    Progress is saved after every page, whether it succeeded or failed, and
    once more when the loop ends, so a run can be resumed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--year', type=int, required=True)
    ap.add_argument('--out-root', required=True)
    ap.add_argument('--template', required=True)
    ap.add_argument('--state', required=True)
    ap.add_argument('--delay', type=float, default=10.0)
    ap.add_argument('--jitter', type=float, default=4.0)
    ap.add_argument('--max-pages', type=int, default=2000)
    args = ap.parse_args()

    base = 'https://ualshowcase.arts.ac.uk/'
    host = urlparse(base).netloc

    out_root = Path(args.out_root)
    template_dir = Path(args.template)
    state_path = Path(args.state)

    program_slug = 'ual-showcase'
    base_dir = out_root / program_slug / str(args.year)
    projects_root = base_dir / 'projects'
    ensure_dir(base_dir / '00_index')
    ensure_dir(projects_root)

    master_csv = base_dir / '00_index' / 'projects_master.csv'
    if not master_csv.exists():
        master_csv.write_text('program,grad_year,project_title,project_url,cover_image_path,image_count,status,notes\n', encoding='utf-8')

    session = requests.Session()
    session.headers.update({'User-Agent': UA})

    state = _load_state(state_path)
    done = set(state.get('done') or [])
    queue = list(state.get('queue') or [])

    if not queue:
        # Nothing left from a previous run: (re)build the queue from the sitemap.
        xml = fetch(session, urljoin(base, 'sitemap.xml'))
        # only project pages
        queue = [u for u in parse_sitemap(xml) if '/project/' in urlparse(u).path]

    processed = 0
    while queue and processed < args.max_pages:
        url = queue.pop(0)
        if url in done:
            continue

        try:
            html = fetch(session, url)
        except Exception as e:
            state.setdefault('errors', []).append({'url': url, 'err': str(e)})
            # Persist right away so the error record survives even if no later
            # page succeeds.
            _save_state(state_path, state, done, queue)
            polite_sleep(args.delay * 1.5, args.jitter)
            continue

        title, text, imgs, links = extract(html, url)
        slug = slugify(title or urlparse(url).path.split('/')[-1])[:80]
        # Titles (and the URL fallback) are not unique, so the directory is
        # disambiguated when a different project already occupies it.
        proj_dir = _resolve_project_dir(projects_root, slug, args.year, url)
        copy_template(template_dir, proj_dir)

        raw_dir = proj_dir / '00_source_page'
        admin_dir = proj_dir / '00_admin'
        text_dir = proj_dir / '01_text'
        originals = proj_dir / '02_images' / 'originals'
        # Do not rely on the template providing every sub-directory.
        for folder in (raw_dir, admin_dir, text_dir, originals):
            ensure_dir(folder)

        (raw_dir / 'page.html').write_bytes(html)
        (admin_dir / 'source_url.txt').write_text(url + '\n', encoding='utf-8')
        meta = {
            'captured_at_utc': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
            'captured_by': 'openclaw',
            'user_agent': UA,
            'notes': 'robots crawl-delay=10 respected'
        }
        (admin_dir / 'capture_meta.json').write_text(json.dumps(meta, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')

        if text:
            (text_dir / 'project_description.md').write_text('# Project Description\n\n' + text + '\n', encoding='utf-8')

        external = [l for l in links if urlparse(l).netloc != host]
        md = '\n'.join(['# Links', '', '## Official page', f'- {url}', '', '## External'] + [f'- {l}' for l in external[:200]])
        (text_dir / 'links.md').write_text(md + '\n', encoding='utf-8')

        saved = download_images(session, imgs[:80], originals, args.delay, args.jitter, host)

        # checksums
        lines = [f"{sha256_file(p)}  {p.name}" for p in sorted(originals.glob('img_*'))]
        (admin_dir / 'checksums.sha256').write_text('\n'.join(lines) + ('\n' if lines else ''), encoding='utf-8')

        cover = saved[0] if saved else ''
        with master_csv.open('a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(['UAL Showcase', args.year, title, url, cover, len(saved), 'downloaded', ''])

        done.add(url)
        processed += 1
        _save_state(state_path, state, done, queue)

        polite_sleep(args.delay, args.jitter)

    # Record the final queue even when the last iterations only skipped
    # already-done URLs.
    _save_state(state_path, state, done, queue)


# Run the downloader only when this file is executed as a script, not on import.
if __name__=='__main__':
    main()
