#!/usr/bin/env python3
import argparse
import hashlib
import io
import json
import os
import re
import unicodedata
import zipfile
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote


def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix.

    The value has whole-second resolution, e.g. ``2024-05-01T12:34:56Z``.
    """
    stamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
    # isoformat() spells UTC as "+00:00"; the stored format uses "Z" instead.
    return stamp.replace("+00:00", "Z")


def normalize_text(value: str) -> str:
    """Reduce *value* to a lowercase, space-separated ASCII alphanumeric key.

    Steps, in order: NFKD-decompose and drop combining marks, apply a fixed
    list of literal substitutions, remove one trailing ``(<digits>)`` suffix,
    lowercase, and collapse every run of characters outside ``[a-z0-9]`` to a
    single space.

    Args:
        value: Text to normalize (e.g. a document name or a PDF base name).

    Returns:
        The normalized key; may be the empty string.
    """
    decomposed = unicodedata.normalize("NFKD", value)
    text = "".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)

    # Literal, case-sensitive substitutions applied in this exact order.
    # "ß" has no NFKD decomposition, so it must be folded explicitly.
    substitutions = (
        ("Ö", "O"),
        ("ö", "o"),
        ("Ä", "A"),
        ("ä", "a"),
        ("Ü", "U"),
        ("ü", "u"),
        ("ß", "ss"),
        ("INQOVI", "INAQOVI"),
        ("960720", "960 720"),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)

    # Drop a single trailing "(n)" marker before lowercasing.
    text = re.sub(r"\((?:\d+)\)$", "", text.strip())
    words = re.sub(r"[^a-z0-9]+", " ", text.lower())
    return words.strip()


def tokenize(value: str) -> List[str]:
    """Split *value* into its normalized tokens.

    The text is passed through ``normalize_text`` and split on whitespace;
    the result contains no empty strings.
    """
    # str.split() with no separator never yields empty tokens.
    return normalize_text(value).split()


def safe_part(value: str) -> str:
    """Return *value* with every character outside ``[a-zA-Z0-9._-]`` replaced by ``_``.

    The result has the same length as the input, since each disallowed
    character maps to exactly one underscore.
    """
    allowed = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-"
    return "".join(ch if ch in allowed else "_" for ch in value)


def encode_pdf_meta_asset(pdf_rel_path: str, page: int = 1) -> str:
    """Build the ``pdfmeta|<page>|<quoted path>|`` asset string.

    Args:
        pdf_rel_path: Path of the PDF; fully percent-encoded (``/`` included).
        page: Page number; coerced with ``int()`` and clamped to at least 1.

    Returns:
        The pipe-delimited asset string, ending with a trailing ``|``.
    """
    page_number = int(page)
    if page_number < 1:
        page_number = 1
    encoded_path = quote(pdf_rel_path, safe="")
    # The empty last element produces the trailing "|" delimiter.
    return "|".join(("pdfmeta", str(page_number), encoded_path, ""))


@dataclass
class PdfEntry:
    """One PDF file pulled out of a (possibly nested) ZIP archive."""

    # Member path inside the archive; for nested archives the enclosing
    # archive names are joined in front with "/".
    source_path: str
    # Base name of the archive member (directory components removed).
    display_name: str
    # Raw file content as read from the archive.
    bytes_data: bytes
    # Size recorded in the member's ZipInfo (``file_size``).
    size: int


@dataclass
class MatchRow:
    """Best document match found for a single PDF by ``compute_matches``."""

    # ``display_name`` of the PDF entry being matched.
    pdf_name: str
    # Document name with the highest score for this PDF.
    best_doc: str
    # max(seq_ratio, token_ratio) for the best document.
    score: float
    # difflib.SequenceMatcher ratio between the normalized names.
    seq_ratio: float
    # 2 * |shared tokens| / (|pdf tokens| + |doc tokens|); 0.0 if either set is empty.
    token_ratio: float
    # Best score minus the runner-up score (runner-up counts as 0.0 when absent).
    margin: float
    # Copied from the matched PdfEntry.
    source_path: str
    # Copied from the matched PdfEntry.
    size: int


def collect_pdfs_from_zip_bytes(data: bytes, prefix: str, out: List[PdfEntry]) -> None:
    """Append every PDF found in an in-memory ZIP (recursing into nested ZIPs) to *out*.

    This is best-effort: data that cannot be opened as a ZIP, and members
    that cannot be read, are skipped silently.

    Args:
        data: Raw bytes of the ZIP archive.
        prefix: Path of this archive within its parent; prepended (with ``/``)
            to each member name to form ``source_path``.
        out: List that receives the discovered ``PdfEntry`` objects (mutated in place).
    """
    try:
        archive = zipfile.ZipFile(io.BytesIO(data))
    except Exception:
        return

    with archive:
        for member in archive.infolist():
            if member.is_dir():
                continue

            member_name = member.filename
            lowered = member_name.lower()
            is_nested_zip = lowered.endswith(".zip")
            # Only ZIP and PDF members are ever read; everything else is ignored.
            if not is_nested_zip and not lowered.endswith(".pdf"):
                continue

            try:
                blob = archive.read(member)
            except Exception:
                continue

            member_path = f"{prefix}/{member_name}"
            if is_nested_zip:
                collect_pdfs_from_zip_bytes(blob, member_path, out)
            else:
                out.append(
                    PdfEntry(
                        source_path=member_path,
                        display_name=os.path.basename(member_name),
                        bytes_data=blob,
                        size=member.file_size,
                    )
                )


def collect_pdfs_from_outer_zip(path: str) -> List[PdfEntry]:
    """Collect all PDFs from the ZIP file at *path*, descending into nested ZIPs.

    Unreadable members are skipped. Failure to open the outer archive itself
    is not caught here and propagates to the caller.

    Args:
        path: Filesystem path of the outer ZIP archive.

    Returns:
        The PDFs found, in archive order, with nested results inserted where
        the nested archive appears.
    """
    found: List[PdfEntry] = []
    with zipfile.ZipFile(path) as archive:
        for member in archive.infolist():
            if member.is_dir():
                continue

            member_name = member.filename
            lowered = member_name.lower()
            is_nested_zip = lowered.endswith(".zip")
            if not is_nested_zip and not lowered.endswith(".pdf"):
                continue

            try:
                blob = archive.read(member)
            except Exception:
                continue

            if is_nested_zip:
                # Top-level members use their own name as the path prefix.
                collect_pdfs_from_zip_bytes(blob, member_name, found)
            else:
                found.append(
                    PdfEntry(
                        source_path=member_name,
                        display_name=os.path.basename(member_name),
                        bytes_data=blob,
                        size=member.file_size,
                    )
                )
    return found


def dedupe_pdfs(entries: List[PdfEntry]) -> List[PdfEntry]:
    """Return one entry per ``display_name``, keeping the one with the largest ``size``.

    On equal sizes the earliest entry wins. Output order follows the first
    appearance of each name in *entries*.
    """
    largest: Dict[str, PdfEntry] = {}
    for candidate in entries:
        incumbent = largest.get(candidate.display_name)
        if incumbent is None or candidate.size > incumbent.size:
            largest[candidate.display_name] = candidate
    return list(largest.values())


def compute_matches(pdf_entries: List[PdfEntry], docs: List[str]) -> List[MatchRow]:
    """Find the best-matching document name for every PDF entry.

    Each PDF base name (extension removed) is compared with every document
    name after normalization. A pair's score is the larger of the difflib
    sequence ratio and the token-overlap ratio
    ``2 * |shared| / (|pdf tokens| + |doc tokens|)``.

    Args:
        pdf_entries: PDFs to match; processed in case-insensitive name order.
        docs: Candidate document names.

    Returns:
        One ``MatchRow`` per PDF. Empty when *docs* is empty. On tied scores
        the document that comes first in *docs* wins.
    """
    import difflib

    # Normalize each document once: (original, normalized string, token set).
    prepared_docs: List[Tuple[str, str, set]] = [
        (doc, normalize_text(doc), set(tokenize(doc))) for doc in docs
    ]

    results: List[MatchRow] = []
    for pdf in sorted(pdf_entries, key=lambda e: e.display_name.lower()):
        stem, _ext = os.path.splitext(pdf.display_name)
        pdf_norm = normalize_text(stem)
        pdf_tokens = set(pdf_norm.split())

        top = None  # (score, seq_ratio, token_ratio, doc) of the current leader
        runner_up_score = None  # second-highest score seen so far
        for doc, doc_norm, doc_tokens in prepared_docs:
            seq_ratio = difflib.SequenceMatcher(None, pdf_norm, doc_norm).ratio()
            token_ratio = 0.0
            if pdf_tokens and doc_tokens:
                shared = len(pdf_tokens & doc_tokens)
                token_ratio = (2.0 * shared) / (len(pdf_tokens) + len(doc_tokens))
            score = max(seq_ratio, token_ratio)

            if top is None or score > top[0]:
                # The previous leader, if any, becomes the runner-up.
                if top is not None:
                    runner_up_score = top[0]
                top = (score, seq_ratio, token_ratio, doc)
            elif runner_up_score is None or score > runner_up_score:
                runner_up_score = score

        if top is None:
            # No documents to compare against.
            continue

        top_score, top_seq, top_token, top_doc = top
        margin = top_score - (runner_up_score if runner_up_score is not None else 0.0)
        results.append(
            MatchRow(
                pdf_name=pdf.display_name,
                best_doc=top_doc,
                score=float(top_score),
                seq_ratio=float(top_seq),
                token_ratio=float(top_token),
                margin=float(margin),
                source_path=pdf.source_path,
                size=pdf.size,
            )
        )

    return results


def _approved_doc_name(record: dict) -> str:
    """Return the record's stripped ``approved_document_name`` ("" if missing or null)."""
    return (record.get("approved_document_name") or "").strip()


def main() -> int:
    """Preview, and with ``--apply`` perform, a bulk import of PDF assets.

    Workflow:
      1. Load the JSON DB and gather every distinct ``approved_document_name``
         from ``raw_email_events`` and ``email_metadata``.
      2. Extract PDFs from the (possibly nested) ZIP, dedupe them by base name
         and score each against the document names.
      3. Keep matches meeting ``--min-score`` / ``--min-margin``, at most one
         PDF per document, and write ``data/pdf-import-report.json``.
      4. With ``--apply``: write each selected PDF into ``--public-dir``, set
         ``thumbnail_path`` on the metadata row (creating the row if needed),
         back up the original DB file and save the updated DB.

    Returns:
        0 on success.

    Raises:
        SystemExit: If the ZIP or DB path does not point to an existing file.
    """
    parser = argparse.ArgumentParser(description="Bulk import PDF assets from nested ZIP into local metadata DB")
    parser.add_argument("--zip", required=True, help="Path to outer zip")
    parser.add_argument("--db", default="data/local-db.json")
    parser.add_argument("--public-dir", default="public/local-thumbnails")
    parser.add_argument("--min-score", type=float, default=0.82)
    parser.add_argument("--min-margin", type=float, default=0.03)
    parser.add_argument("--apply", action="store_true", help="Write files + update local-db")
    args = parser.parse_args()

    if not os.path.isfile(args.zip):
        raise SystemExit(f"Missing zip: {args.zip}")
    if not os.path.isfile(args.db):
        raise SystemExit(f"Missing db: {args.db}")

    # Keep the untouched file content so the backup written later really is
    # the pre-import state (the in-memory db is mutated before saving).
    with open(args.db, "rb") as f:
        original_db_bytes = f.read()
    db = json.loads(original_db_bytes.decode("utf-8"))

    raw_events = db.get("raw_email_events") or []
    metadata = db.get("email_metadata")
    if metadata is None:
        # Attach the list to db so rows inserted below are actually saved.
        metadata = db["email_metadata"] = []

    docs = sorted({name for name in map(_approved_doc_name, raw_events + metadata) if name})

    pdf_entries = dedupe_pdfs(collect_pdfs_from_outer_zip(args.zip))
    by_name = {entry.display_name: entry for entry in pdf_entries}

    matches = compute_matches(pdf_entries, docs)

    eligible = [
        m for m in matches if m.score >= args.min_score and m.margin >= args.min_margin and m.best_doc
    ]
    # Keep only best PDF per target document.
    best_for_doc: Dict[str, MatchRow] = {}
    for row in eligible:
        cur = best_for_doc.get(row.best_doc)
        if cur is None or (row.score, row.margin, row.size) > (cur.score, cur.margin, cur.size):
            best_for_doc[row.best_doc] = row

    # Highest score first, ties broken by document name. The apply/skip lists
    # are filled in this order, so they need no further sorting.
    selected = sorted(best_for_doc.values(), key=lambda x: (-x.score, x.best_doc.lower()))
    selected_docs = {m.best_doc for m in selected}

    # Later rows win when several metadata rows share a document name.
    metadata_by_doc: Dict[str, dict] = {}
    for record in metadata:
        name = _approved_doc_name(record)
        if name:
            metadata_by_doc[name] = record

    will_skip_existing = []
    will_apply = []
    for row in selected:
        md = metadata_by_doc.get(row.best_doc) or {}
        if (md.get("thumbnail_path") or "").strip():
            will_skip_existing.append(row)
        else:
            will_apply.append(row)

    low_conf = sorted(
        (
            m
            for m in matches
            if m.best_doc not in selected_docs and (m.score < args.min_score or m.margin < args.min_margin)
        ),
        key=lambda x: (-x.score, x.pdf_name.lower()),
    )

    report = {
        "zip": args.zip,
        "pdf_count_deduped": len(pdf_entries),
        "match_count": len(matches),
        "selected_count": len(selected),
        "will_apply_count": len(will_apply),
        "will_skip_existing_count": len(will_skip_existing),
        "low_confidence_count": len(low_conf),
        "selected": [vars(row) for row in selected],
        "will_apply": [vars(row) for row in will_apply],
        "will_skip_existing": [vars(row) for row in will_skip_existing],
        "low_confidence": [vars(row) for row in low_conf],
    }

    report_path = os.path.join("data", "pdf-import-report.json")
    os.makedirs("data", exist_ok=True)
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    print("=== PDF Import Preview ===")
    print(f"zip: {args.zip}")
    print(f"pdfs (deduped by basename): {len(pdf_entries)}")
    print(f"selected high-confidence matches: {len(selected)}")
    print(f"apply candidates (no existing thumbnail): {len(will_apply)}")
    print(f"skipped (already has thumbnail): {len(will_skip_existing)}")
    print(f"low-confidence/unselected: {len(low_conf)}")
    print(f"report: {report_path}")

    print("\nTop apply candidates:")
    for row in will_apply[:30]:
        print(f"  {row.score:.3f} (m={row.margin:.3f})  {row.pdf_name} -> {row.best_doc}")

    if not args.apply:
        print("\nDry-run only. Re-run with --apply to write files and update local-db.")
        return 0

    os.makedirs(args.public_dir, exist_ok=True)
    # Files are written into --public-dir. The path stored in the DB is
    # "<last component of --public-dir>/<file>", which is
    # "local-thumbnails/<file>" for the default value.
    # NOTE(review): assumes assets are served relative to the parent of
    # --public-dir - confirm for non-default directories.
    rel_prefix = os.path.basename(os.path.normpath(args.public_dir))

    changes = 0
    inserted_metadata = 0
    skipped_missing_entry = 0

    for row in will_apply:
        entry = by_name.get(row.pdf_name)
        if entry is None:
            skipped_missing_entry += 1
            continue

        # Ensure metadata row exists
        md = metadata_by_doc.get(row.best_doc)
        if md is None:
            md = {
                "approved_document_name": row.best_doc,
                "friendly_title": None,
                "brand": None,
                "notes": None,
                "thumbnail_path": None,
                "campaign_group": None,
                "updated_at": now_iso(),
            }
            metadata.append(md)
            metadata_by_doc[row.best_doc] = md
            inserted_metadata += 1

        # Never overwrite an asset that is already set.
        if (md.get("thumbnail_path") or "").strip():
            continue

        base = safe_part(row.best_doc)
        file_part = safe_part(row.pdf_name)
        file_name = f"{base}-{int(datetime.now().timestamp() * 1000)}-{file_part}"
        rel = f"{rel_prefix}/{file_name}" if rel_prefix else file_name
        abs_path = os.path.join(args.public_dir, file_name)
        with open(abs_path, "wb") as f:
            f.write(entry.bytes_data)

        md["thumbnail_path"] = encode_pdf_meta_asset(rel, page=1)
        md["updated_at"] = now_iso()
        changes += 1

    # backup (original bytes, not the mutated db) + write db
    stamp = int(datetime.now().timestamp() * 1000)
    backup = f"{args.db}.bak.{stamp}"
    with open(backup, "wb") as f:
        f.write(original_db_bytes)
    with open(args.db, "w", encoding="utf-8") as f:
        json.dump(db, f, ensure_ascii=False, indent=2)
        f.write("\n")

    print("\n=== Applied ===")
    print(f"metadata rows updated with pdf asset: {changes}")
    print(f"metadata rows inserted: {inserted_metadata}")
    print(f"skipped missing entry: {skipped_missing_entry}")
    print(f"db backup: {backup}")

    return 0


# Run the CLI only when executed as a script; main()'s return value is
# passed to SystemExit and so becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
