#!/usr/bin/env python3
"""Download NPM curation (theme.npm.edu.tw) public preview by id.

Python's TLS verification currently fails against this site in this
environment, so all HTTPS fetches are delegated to the ``curl`` binary.
A polite delay (plus random jitter) is still inserted after each image download.

Outputs:
- meta/curationDataForPublic.json
- article.md (text reconstruction)
- images/*.jpg (cover + all collectionId images)

Usage:
  npm_curation_dl.py --id 112 --out /path --delay 1.2 --jitter 1.0
"""

import argparse, json, os, random, re, subprocess, time
from pathlib import Path
from urllib.parse import urljoin

# Browser-like User-Agent string sent as a header on every curl request.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36"

# GraphQL query template for one curation. It is filled in with old-style
# ``%`` formatting using a mapping that provides ``id`` (the curation id).
# The trailing ``.strip()`` removes the leading/trailing newlines of the
# triple-quoted literal.
QUERY_TPL = """
query{
  curationDataForPublic(curationId:%(id)s)
  {
    coverId
    curationId
    curationName
    curator
    textAlign
    description
    memberRole
    team
    blockList
    {
      blockId
      blockType
      subject
      description
      isReverse
      colsPerRow
      itemList {
        itemId
        collectionId
        subject
        description
        textAlign
        x
        y
        scale
      }
    }
  }
}
""".strip()


def sleep_polite(base: float, jitter: float):
    """Pause between requests for ``base`` seconds plus a random extra.

    Args:
        base: Fixed part of the pause, in seconds.
        jitter: Bound of the random extra; the extra is drawn uniformly
            between 0 and ``jitter``.

    The total is clamped to zero, so a negative ``base`` or ``jitter``
    (e.g. from a mistyped command-line value) results in no pause instead
    of the ``ValueError`` that ``time.sleep`` raises for negative durations.
    """
    pause = base + random.uniform(0, jitter)
    time.sleep(max(0.0, pause))


def curl_json(url: str, payload: dict, timeout: int = 60) -> dict:
    """POST ``payload`` as a JSON body to ``url`` with curl and parse the reply.

    Args:
        url: Endpoint to send the request to.
        payload: Object serialized with ``json.dumps`` and sent as the body.
        timeout: Value passed to curl's ``--max-time``, in seconds.

    Returns:
        The response body parsed as JSON.

    Raises:
        subprocess.CalledProcessError: curl exited with a non-zero status.
        json.JSONDecodeError: the response body is not valid JSON.
    """
    body = json.dumps(payload, ensure_ascii=False)
    headers = (
        f"User-Agent: {UA}",
        "Accept: application/json",
        "Content-Type: application/json",
    )

    # -sS: quiet but still print errors; -L: follow redirects.
    cmd = ["curl", "-sS", "-L", "--max-time", str(timeout)]
    for header in headers:
        cmd += ["-H", header]
    cmd += ["--data-binary", body, url]

    raw = subprocess.check_output(cmd)
    # Bytes that are not valid UTF-8 are dropped rather than raising.
    text = raw.decode("utf-8", errors="ignore")
    return json.loads(text)


def curl_bytes(url: str, out_path: Path, timeout: int = 120) -> None:
    """Download ``url`` with curl and store the body at ``out_path``.

    The body is first written to a sibling ``<name>.part`` file and renamed
    onto ``out_path`` only after curl succeeds, so ``out_path`` never holds
    a truncated download. ``--fail`` makes curl exit non-zero on HTTP error
    statuses instead of saving the server's error page as if it were the
    requested file.

    Args:
        url: Resource to fetch.
        out_path: Destination file; its parent directories are created.
        timeout: Value passed to curl's ``--max-time``, in seconds.

    Raises:
        subprocess.CalledProcessError: curl exited with a non-zero status
            (network failure, timeout, or an HTTP error response).
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    part_path = out_path.with_name(out_path.name + ".part")
    try:
        subprocess.check_call(
            [
                "curl",
                "-sS",
                "-L",
                "--fail",
                "--max-time",
                str(timeout),
                "-H",
                f"User-Agent: {UA}",
                "-o",
                str(part_path),
                url,
            ]
        )
        # curl may not create the output file for an empty body; in that
        # case there is nothing to move into place.
        if part_path.exists():
            part_path.replace(out_path)
    finally:
        # Remove leftovers of a failed transfer; after a successful rename
        # the temporary file is already gone.
        try:
            part_path.unlink()
        except FileNotFoundError:
            pass


def main():
    """Command-line entry point: fetch one curation and write it to disk.

    Steps:
      1. Query the GraphQL endpoint for the curation given by ``--id`` and
         save the raw response to ``meta/curationDataForPublic.json``.
      2. Download the cover image and every distinct ``collectionId`` image
         into ``images/`` (files that already exist are skipped), pausing
         politely after each download attempt.
      3. Write a Markdown reconstruction of the text to ``article.md``.

    Image downloads inside blocks are best-effort: a failure is reported on
    stderr and the run continues. A failing cover download aborts the run.

    Raises:
        SystemExit: the response contains no ``curationDataForPublic`` data.
    """
    import sys  # local import: only needed for stderr diagnostics here

    ap = argparse.ArgumentParser()
    ap.add_argument("--id", type=int, required=True)
    ap.add_argument("--out", required=True)
    ap.add_argument("--base", default="https://theme.npm.edu.tw")
    ap.add_argument("--width", type=int, default=2400)
    ap.add_argument("--delay", type=float, default=1.0)
    ap.add_argument("--jitter", type=float, default=0.8)
    args = ap.parse_args()

    out = Path(args.out)
    (out / "images").mkdir(parents=True, exist_ok=True)
    (out / "meta").mkdir(parents=True, exist_ok=True)

    gql = urljoin(args.base, "/CurationApi/graphql")
    payload = {"query": QUERY_TPL % {"id": args.id}}

    data = curl_json(gql, payload)
    # Persist the raw response before validating it, so it can be inspected
    # even when the run aborts below.
    (out / "meta" / "curationDataForPublic.json").write_text(
        json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
    )

    # A GraphQL error reply may carry "data": null, so do not rely on the
    # default of dict.get() to obtain a dict here.
    envelope = data if isinstance(data, dict) else {}
    if envelope.get("errors"):
        print(
            "GraphQL errors: " + json.dumps(envelope["errors"], ensure_ascii=False),
            file=sys.stderr,
        )
    inner = envelope.get("data")
    c = inner.get("curationDataForPublic") if isinstance(inner, dict) else None
    if not c:
        raise SystemExit("No curationDataForPublic in response")

    title = c.get("curationName") or f"curation_{args.id}"
    curator = c.get("curator") or ""
    desc = c.get("description") or ""

    # Cover
    cover_id = c.get("coverId")
    if cover_id:
        cover_url = urljoin(args.base, f"/CurationApi/pic?id={cover_id}&w={args.width}&b=1")
        cover_path = out / "images" / f"cover_{cover_id}.jpg"
        if not cover_path.exists():
            curl_bytes(cover_url, cover_path)
            sleep_polite(args.delay, args.jitter)

    seen = set()  # collectionIds already handled, to avoid duplicate downloads
    failed = []  # collectionIds whose download failed in this run
    blocks_md = []
    for b in c.get("blockList") or []:
        btype = b.get("blockType")
        blocks_md.append(f"## Block {b.get('blockId')} (type {btype})")

        bsubject = (b.get("subject") or "").strip()
        bdesc = (b.get("description") or "").strip()
        if bsubject:
            blocks_md.append(f"**Subject:** {bsubject}")
        if bdesc:
            blocks_md.append(bdesc)

        for it in b.get("itemList") or []:
            cid = it.get("collectionId")
            if cid and cid not in seen:
                seen.add(cid)
                img_url = urljoin(args.base, f"/CurationApi/pic?id={cid}&w={args.width}&b=1")
                img_path = out / "images" / f"img_{cid}.jpg"
                if not img_path.exists():
                    try:
                        curl_bytes(img_url, img_path)
                    except (subprocess.SubprocessError, OSError) as exc:
                        # Best-effort: keep going, but make the failure visible.
                        failed.append(cid)
                        print(f"WARN: image {cid} failed: {exc}", file=sys.stderr)
                        # The file did not exist before this attempt, so
                        # anything there now is a partial download. Remove it
                        # so the exists() check retries it on the next run.
                        try:
                            img_path.unlink()
                        except OSError:
                            pass  # nothing was written, or it cannot be removed
                    # Pause after every attempt, successful or not.
                    sleep_polite(args.delay, args.jitter)

            itsub = (it.get("subject") or "").strip()
            itdesc = (it.get("description") or "").strip()
            if itsub or itdesc or cid:
                blocks_md.append(f"- Item {it.get('itemId')} | collectionId={cid}")
                if itsub:
                    blocks_md.append(f"  - {itsub}")
                if itdesc:
                    blocks_md.append(f"  - {itdesc}")

        blocks_md.append("")

    md = [
        f"# {title}",
        "",
        f"Curator: {curator}",
        "",
        desc,
        "",
        f"Source: {args.base}/curation/preview_public/?id={args.id}",
        "",
        "---",
        "",
    ]
    md += blocks_md
    (out / "article.md").write_text("\n".join(md) + "\n", encoding="utf-8")

    print(f"OK: {out}")
    print(f"images: {len(list((out/'images').glob('*.jpg')))}")
    if failed:
        print(f"WARN: {len(failed)} image download(s) failed: {failed}", file=sys.stderr)


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
