#!/usr/bin/env python3
"""Downloader for compartsblog.doc.gold.ac.uk final projects pages.

Approach:
- Fetch one "final-projects-YYYY-class" page.
- Extract /index.php/work/... links as individual works.
- Download each work page into its own project package, along with its images.

Polite delay default 10s (no robots.txt available; stay conservative).
"""

import argparse, csv, json, os, random, re, time, subprocess, hashlib
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36"


def ensure_dir(p: Path) -> None:
    """Create directory *p* together with any missing parents.

    Does nothing when the directory already exists.
    """
    p.mkdir(exist_ok=True, parents=True)


def slugify(s: str) -> str:
    """Turn *s* into a lowercase, dash-separated ASCII slug.

    Every run of characters outside ``a-z0-9`` becomes a single dash, and
    leading/trailing dashes are removed. Returns ``"untitled"`` when nothing
    usable is left.
    """
    lowered = s.strip().lower()
    dashed = re.sub(r"[^a-z0-9]+", "-", lowered)
    collapsed = re.sub(r"-+", "-", dashed)
    slug = collapsed.strip("-")
    if not slug:
        return "untitled"
    return slug


def curl_fetch(url: str, out: Path | None = None, timeout: int = 60) -> bytes:
    """Fetch *url* with the ``curl`` command-line tool and return the body.

    Redirects are followed and the module-level ``UA`` string is sent as the
    User-Agent header.

    Args:
        url: Address to fetch.
        out: Optional file to write the body to; its parent directory is
            created when missing. Nothing is written if the fetch fails.
        timeout: Overall time limit in seconds, passed to ``--max-time``.

    Returns:
        The raw response body.

    Raises:
        subprocess.CalledProcessError: curl exited non-zero (network error,
            timeout, or an HTTP error status).
        OSError: curl could not be started or *out* could not be written.
    """
    cmd = [
        "curl", "-sS", "-L",
        # Without --fail curl exits 0 on HTTP 4xx/5xx and hands back the
        # server's error page, which would then be stored as if it were the
        # requested page or image.
        "--fail",
        "--max-time", str(timeout),
        "-H", f"User-Agent: {UA}",
        url,
    ]
    data = subprocess.check_output(cmd)
    if out is not None:
        ensure_dir(out.parent)
        out.write_bytes(data)
    return data


def pick_best_img(img) -> str | None:
    srcset = img.get("srcset")
    if srcset:
        cands=[]
        for part in srcset.split(','):
            part=part.strip()
            if not part: continue
            bits=part.split()
            u=bits[0]
            w=0
            if len(bits)>1:
                m=re.match(r"(\d+)(w|x)", bits[1])
                if m: w=int(m.group(1))
            cands.append((w,u))
        cands.sort(key=lambda x:x[0], reverse=True)
        return cands[0][1] if cands else None
    return img.get('src')


def extract_work(html: bytes, base_url: str):
    """Parse one work page into ``(title, text, image_urls, link_urls)``.

    - title: text of the first ``<h1>`` if it has any, otherwise the
      ``<title>`` text, otherwise an empty string.
    - text: the first 400 non-trivial ``h1``/``h2``/``h3``/``p``/``li`` texts
      found in ``<main>`` (or ``<body>``, or the whole document), separated by
      blank lines.
    - image_urls / link_urls: absolute URLs resolved against *base_url*, with
      duplicates removed and first-seen order kept.
    """
    def dedupe(items):
        # dicts keep insertion order, so the first occurrence of each URL wins.
        return list(dict.fromkeys(items))

    soup = BeautifulSoup(html, 'html.parser')

    heading = soup.find('h1')
    if heading and heading.get_text(strip=True):
        title = heading.get_text(' ', strip=True)
    elif soup.title:
        title = soup.title.get_text(' ', strip=True)
    else:
        title = ''

    container = soup.find('main') or soup.body or soup

    raw_texts = (
        el.get_text(' ', strip=True)
        for el in container.find_all(['h1', 'h2', 'h3', 'p', 'li'], recursive=True)
    )
    # Single-character fragments (stray bullets, separators) are dropped.
    paragraphs = [t for t in raw_texts if len(t) > 1]
    text = '\n\n'.join(paragraphs[:400]).strip()

    img_sources = (pick_best_img(tag) for tag in container.find_all('img'))
    imgs = [urljoin(base_url, src) for src in img_sources if src]

    hrefs = (anchor.get('href') for anchor in container.find_all('a'))
    links = [urljoin(base_url, href) for href in hrefs if href]

    return title.strip(), text, dedupe(imgs), dedupe(links)


def sha256_file(p: Path) -> str:
    """Return the hex SHA-256 digest of the file at *p*.

    The file is read in 1 MiB chunks so it never has to fit in memory at once.
    """
    digest = hashlib.sha256()
    chunk_size = 1024 * 1024
    with p.open('rb') as fh:
        while chunk := fh.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()


def copy_template(template_dir: Path, dest_dir: Path):
    """Copy the template tree to *dest_dir* unless *dest_dir* already exists.

    An existing destination is left untouched, so re-running never overwrites
    a package that was created earlier.
    """
    from shutil import copytree

    if not dest_dir.exists():
        copytree(template_dir, dest_dir)


def download_images(img_urls, originals_dir: Path, delay: float, jitter: float, allow_host: str):
    """Download same-host images into *originals_dir* as ``img_NNN.<ext>``.

    Args:
        img_urls: Absolute image URLs; ``NNN`` is the 1-based position of the
            URL in this sequence, including URLs that end up skipped.
        originals_dir: Directory that receives the files.
        delay: Base pause in seconds after each attempted download.
        jitter: Upper bound of the random extra pause added to *delay*.
        allow_host: Only URLs whose network location equals this are fetched.

    Returns:
        Paths (as strings) of the images that are present on disk: freshly
        downloaded ones plus files that already existed. URLs on other hosts
        and failed downloads are left out.
    """
    known_exts = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
    saved = []
    for i, u in enumerate(img_urls, start=1):
        parsed = urlparse(u)
        if parsed.netloc != allow_host:
            continue
        ext = os.path.splitext(parsed.path)[1].lower()
        if ext not in known_exts:
            ext = '.jpg'
        out = originals_dir / f"img_{i:03d}{ext}"
        if out.exists():
            # Already fetched on an earlier run; no request, so no delay.
            saved.append(str(out))
            continue
        try:
            curl_fetch(u, out=out, timeout=120)
        except Exception:
            # Best-effort: one broken image must not abort the whole project.
            pass
        else:
            saved.append(str(out))
        # Pause after every attempted request, failed ones included, so a run
        # of errors never turns into back-to-back hits on the server.
        time.sleep(delay + random.uniform(0, jitter))
    return saved


def _load_state(state_path: Path) -> dict:
    """Load the resume state from *state_path*, or return a fresh empty state."""
    state = {'done': [], 'queue': [], 'errors': []}
    if state_path.exists():
        try:
            loaded = json.loads(state_path.read_text(encoding='utf-8'))
        except (OSError, ValueError):
            # Unreadable or corrupt state file: start over instead of aborting.
            loaded = None
        # Anything other than a JSON object cannot be used as state.
        if isinstance(loaded, dict):
            state = loaded
    return state


def _save_state(state_path: Path, state: dict, done: set, queue: list) -> None:
    """Write the current progress (done set, remaining queue, errors) to disk."""
    state['done'] = sorted(done)
    state['queue'] = queue
    # The state file may live outside --out-root, so its folder may not exist yet.
    ensure_dir(state_path.parent)
    state_path.write_text(json.dumps(state, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')


def _discover_work_links(class_url: str, host: str) -> list:
    """Fetch the class page and return its unique same-host work links in page order."""
    soup = BeautifulSoup(curl_fetch(class_url), 'html.parser')
    found = []
    for a in soup.find_all('a'):
        href = a.get('href')
        if not href:
            continue
        u = urljoin(class_url, href)
        p = urlparse(u)
        if p.netloc == host and '/index.php/work/' in p.path:
            found.append(u)
    return list(dict.fromkeys(found))


def _project_dir_for(projects_dir: Path, title: str, url: str, year: int) -> Path:
    """Return the package directory for a work, keeping same-titled works apart."""
    slug = slugify(title)[:80]
    proj_dir = projects_dir / f"{slug}__unknown__compartsblog__{year}"
    marker = proj_dir / '00_admin' / 'source_url.txt'
    if marker.exists() and marker.read_text(encoding='utf-8').strip() != url:
        # The directory belongs to a different work with the same title
        # slug. Sharing it would overwrite its text and reuse its img_NNN
        # files, so derive a stable per-URL suffix instead.
        digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:8]
        proj_dir = projects_dir / f"{slug}-{digest}__unknown__compartsblog__{year}"
    return proj_dir


def main():
    """Command-line entry point: archive every work linked from one class page.

    Work URLs are taken from the saved queue in the state file, or discovered
    from ``--class-url`` when that queue is empty. Each work gets a project
    package (template copy, raw HTML, capture metadata, extracted text, links,
    images, checksums) and a row in ``projects_master.csv``. Progress is
    written to the state file after every work, successful or not, so an
    interrupted run can be resumed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--class-url', required=True, help='e.g. https://compartsblog.../final-projects-2019-class/')
    ap.add_argument('--program', default='Goldsmiths Computational Arts (blog)')
    ap.add_argument('--year', type=int, required=True)
    ap.add_argument('--out-root', default=str(Path.home()/ 'Desktop' / 'Goldsmiths_MediaLibrary'))
    ap.add_argument('--template', default=str(Path.home()/'Desktop'/'RCA_MediaLibrary'/'_TEMPLATE'/'project_package'))
    ap.add_argument('--state', default=str(Path.home()/'Desktop'/'Goldsmiths_MediaLibrary'/'_state_compartsblog.json'))
    ap.add_argument('--delay', type=float, default=10.0)
    ap.add_argument('--jitter', type=float, default=4.0)
    ap.add_argument('--max-pages', type=int, default=300)
    args = ap.parse_args()

    host = urlparse(args.class_url).netloc

    out_root = Path(args.out_root)
    template_dir = Path(args.template)
    state_path = Path(args.state)

    program_slug = slugify(args.program)
    base_dir = out_root / program_slug / str(args.year)
    projects_dir = base_dir / 'projects'
    ensure_dir(base_dir / '00_index')
    ensure_dir(projects_dir)

    master_csv = base_dir / '00_index' / 'projects_master.csv'
    if not master_csv.exists():
        master_csv.write_text('program,grad_year,project_title,project_url,cover_image_path,image_count,status,notes\n', encoding='utf-8')

    state = _load_state(state_path)
    done = set(state.get('done', []))
    queue = state.get('queue', [])

    if not queue:
        # Drop works that are already done so we only pause when there is
        # actually something left to fetch.
        queue = [u for u in _discover_work_links(args.class_url, host) if u not in done]
        if queue:
            # The class page was a request too; wait before the first work page.
            time.sleep(args.delay + random.uniform(0, args.jitter))

    processed = 0
    while queue and processed < args.max_pages:
        url = queue.pop(0)
        if url in done:
            continue
        try:
            html = curl_fetch(url)
        except Exception as e:
            state.setdefault('errors', []).append({'url': url, 'err': str(e)})
            # Persist right away so the error record survives even if no
            # later work succeeds in this run.
            _save_state(state_path, state, done, queue)
            time.sleep(args.delay * 2)
            continue

        title, text, imgs, links = extract_work(html, url)
        proj_dir = _project_dir_for(projects_dir, title, url, args.year)
        copy_template(template_dir, proj_dir)

        admin_dir = proj_dir / '00_admin'
        text_dir = proj_dir / '01_text'
        raw_dir = proj_dir / '00_source_page'
        # Do not rely on the template to provide every folder written below.
        for folder in (admin_dir, text_dir, raw_dir):
            ensure_dir(folder)

        (raw_dir / 'page.html').write_bytes(html)
        (admin_dir / 'source_url.txt').write_text(url + '\n', encoding='utf-8')
        meta = {
            'captured_at_utc': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
            'captured_by': 'openclaw',
            'user_agent': UA,
            'notes': 'robots.txt not found (404); conservative delay applied'
        }
        (admin_dir / 'capture_meta.json').write_text(json.dumps(meta, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')

        if text:
            (text_dir / 'project_description.md').write_text('# Project Description\n\n' + text + '\n', encoding='utf-8')

        ext = [l for l in links if urlparse(l).netloc != host]
        md = '\n'.join(['# Links', '', '## Official page', f'- {url}', '', '## External'] + [f'- {l}' for l in ext[:200]])
        (text_dir / 'links.md').write_text(md + '\n', encoding='utf-8')

        originals = proj_dir / '02_images' / 'originals'
        ensure_dir(originals)
        saved = download_images(imgs[:80], originals, args.delay, args.jitter, host)

        # checksums
        lines = [f"{sha256_file(p)}  {p.name}" for p in sorted(originals.glob('img_*'))]
        (admin_dir / 'checksums.sha256').write_text('\n'.join(lines) + ('\n' if lines else ''), encoding='utf-8')

        cover = saved[0] if saved else ''
        with master_csv.open('a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow([args.program, args.year, title, url, cover, len(saved), 'downloaded', ''])

        done.add(url)
        processed += 1
        _save_state(state_path, state, done, queue)

        # No point waiting once there is nothing left to request.
        if queue and processed < args.max_pages:
            time.sleep(args.delay + random.uniform(0, args.jitter))


# Run the downloader only when this file is executed as a script, not on import.
if __name__=='__main__':
    main()
