#!/usr/bin/env python3
"""
NZ Employment Law Case Scraper
Downloads all employment law determinations from:
1. Employment Relations Authority (ERA) - determinations.era.govt.nz
2. Employment Court (NZEmpC) - employmentcourt.govt.nz
3. NZLII (nzlii.org) - ERA, Employment Court, and Employment Tribunal archives

Usage:
    python3 scraper.py [--source era|empc|nzlii|all] [--resume] [--dry-run] [--min-space-gb 5]
"""

import os
import re
import sys
import time
import json
import shutil
import hashlib
import logging
import argparse
import subprocess
import tempfile
import unicodedata
from datetime import datetime, date
from pathlib import Path
from urllib.parse import urljoin, quote, unquote

import requests
from bs4 import BeautifulSoup

# === Configuration ===
# Target volume for all downloaded PDFs; the log and resume-state files live alongside them.
BASE_DIR = Path("/mnt/FILE/SITES/caselawandreference.workingforworkers.nz/reference")
LOG_FILE = BASE_DIR / "download.log"
STATE_FILE = BASE_DIR / ".scraper_state.json"  # hidden JSON file consumed by load_state()/save_state()
MIN_SPACE_GB = 5  # Pause if disk drops below this

# ERA determination search endpoint; results are paginated via a "start" offset parameter
ERA_BASE = "https://determinations.era.govt.nz"
ERA_SEARCH_URL = f"{ERA_BASE}/determination-search-page/DeterminationSearchForm"
ERA_RESULTS_PER_PAGE = 10

# Employment Court decisions listing; also paginated via a "start" offset
EMPC_BASE = "https://www.employmentcourt.govt.nz"
EMPC_DECISIONS_URL = f"{EMPC_BASE}/judgments/decisions/"
EMPC_RESULTS_PER_PAGE = 25

# NZLII case-law databases scraped by this tool (path fragments under NZLII_BASE)
NZLII_BASE = "https://www.nzlii.org"
NZLII_DATABASES = {
    "nzera": "/nz/cases/NZERA/",
    "nzempc": "/nz/cases/NZEmpC/",
    "nzempt": "/nz/cases/NZEmpT/",
}
# NZLII blocks bot user-agents with 410 Gone; browser-like headers required
NZLII_HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-NZ,en;q=0.9",
}

# Rate limiting (seconds between requests)
RATE_LIMIT_LISTING = 2.0   # Between listing page fetches
RATE_LIMIT_DOWNLOAD = 1.0  # Between PDF downloads
RATE_LIMIT_RETRY = 30.0    # Wait before retrying after error

MAX_RETRIES = 3  # attempts per page fetch / PDF download before giving up
REQUEST_TIMEOUT = 60  # seconds per HTTP request

# Month name mapping for filename construction
MONTH_NAMES = {
    1: "January", 2: "February", 3: "March", 4: "April",
    5: "May", 6: "June", 7: "July", 8: "August",
    9: "September", 10: "October", 11: "November", 12: "December"
}

# === Logging Setup ===
def setup_logging():
    """Configure and return the shared "scraper" logger.

    DEBUG and above goes to LOG_FILE; INFO and above goes to the console.

    Idempotent: if the logger already has handlers (e.g. this module is
    imported or this function is called more than once), the existing
    logger is returned unchanged instead of stacking duplicate handlers,
    which would emit every message multiple times.

    Returns:
        logging.Logger: the configured "scraper" logger.
    """
    logger = logging.getLogger("scraper")
    if logger.handlers:
        # Already configured in this process — don't add handlers twice.
        return logger

    logger.setLevel(logging.DEBUG)

    # Ensure the log directory exists before FileHandler tries to open the file.
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

    # File handler: full detail with dates, for post-run auditing
    fh = logging.FileHandler(LOG_FILE, encoding="utf-8")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(logging.Formatter(
        "%(asctime)s [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    ))

    # Console handler: terser timestamps for interactive runs
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"))

    logger.addHandler(fh)
    logger.addHandler(ch)
    return logger

# Shared module-level logger used by every function in this file
log = setup_logging()

# === Session Setup ===
# Single shared HTTP session for the ERA and Employment Court sites.
# (NZLII gets its own session further down with browser-like headers.)
session = requests.Session()
session.headers.update({
    "User-Agent": "NZ-Employment-Law-Research-Bot/1.0 (Academic research; respectful crawling)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-NZ,en;q=0.9",
})
session.verify = True


# === Utility Functions ===

def check_disk_space():
    """Return the free space (in GB) on the volume holding BASE_DIR."""
    free_bytes = shutil.disk_usage(str(BASE_DIR)).free
    gb_free = free_bytes / (1024 ** 3)
    log.debug(f"Disk space check: {gb_free:.1f} GB free")
    return gb_free


def sanitize_filename(name):
    """
    Make a case title safe for the filesystem, following the convention:
    Party_v_Party_[CaseDetails]_[Year]_CourtCode_Number_(Date).pdf

    Spaces become underscores, "/" becomes "-", characters that are
    illegal or troublesome on common filesystems are dropped, runs of
    underscores are collapsed, and stray edge underscores/dots removed.
    Ampersands, apostrophes, commas, hyphens, brackets and periods are
    deliberately preserved.
    """
    cleaned = name.replace(" ", "_").replace("/", "-")
    # Drop problematic filesystem characters outright
    cleaned = re.sub(r'[<>:"|?*]', '', cleaned)
    # Collapse runs of underscores into one
    cleaned = re.sub(r'_+', '_', cleaned)
    # Trim leading/trailing underscores and dots
    return cleaned.strip('_.')


def truncate_filename(filename, max_bytes=255):
    """
    Shorten a filename so its UTF-8 encoding fits the filesystem limit
    (255 bytes on ext4). The ".pdf" extension is preserved, and when a
    reasonably-placed underscore exists the cut is made there for a
    cleaner break.
    """
    if len(filename.encode('utf-8')) <= max_bytes:
        return filename

    ext = '.pdf'
    budget = max_bytes - len(ext.encode('utf-8'))
    stem = filename[:-len(ext)] if filename.endswith(ext) else filename

    # Truncate at the byte level, discarding any split multi-byte character
    shortened = stem.encode('utf-8')[:budget].decode('utf-8', errors='ignore')

    # Prefer ending on an underscore, but only if that keeps most of the name
    cut_at = shortened.rfind('_')
    if cut_at > budget // 2:
        shortened = shortened[:cut_at]

    return shortened + ext


def build_era_filename(title, citation, date_str, office_str=""):
    """
    Build a filename from ERA determination metadata.
    Format: Party_v_Party_(Office)_[Year]_NZERA_Number_(DD_Month_YYYY).pdf
    """
    # Citation like "[2025] NZERA 370" (brackets optional) -> "[2025]_NZERA_370"
    stripped_cite = citation.strip()
    m = re.match(r'\[?(\d{4})\]?\s*(NZERA)\s+(\d+)', stripped_cite)
    if m:
        year, court, num = m.groups()
        cite_part = f"[{year}]_{court}_{num}"
    else:
        # Unrecognised citation shape: keep it, just made filesystem-safe
        cite_part = sanitize_filename(stripped_cite)

    # ERA supplies titles as "PARTY v PARTY"; title-case each side when the
    # split is unambiguous, otherwise fall back to plain sanitizing.
    stripped_title = title.strip()
    sides = stripped_title.split(" v ")
    if len(sides) == 2:
        left = title_case_party(sides[0].strip())
        right = title_case_party(sides[1].strip())
        title_part = f"{left}_v_{right}"
    else:
        title_part = sanitize_filename(stripped_title)

    # Office string like "Employment Relations Authority - Auckland" -> "(Auckland)"
    office_part = ""
    if office_str:
        hit = re.search(r'- (\w+)', office_str)
        if hit:
            office_part = f"({hit.group(1)})"

    date_part = format_date_for_filename(date_str)

    pieces = [title_part]
    if office_part:
        pieces.append(office_part)
    pieces.append(cite_part)
    if date_part:
        pieces.append(f"({date_part})")

    # Sanitize, squash any leftover double underscores, enforce the 255-byte cap
    assembled = sanitize_filename("_".join(pieces) + ".pdf").replace("__", "_")
    return truncate_filename(assembled)


def is_anonymised_code(word):
    """
    Decide whether a short uppercase word is an anonymised party code
    rather than a real name. ERA cases use random letter codes (e.g.
    ABC, GF, XDR) for name-suppressed parties; those must stay uppercase.

    Returns True when the word should stay uppercase (it is a code),
    False when it is a real name/word that should be title-cased.
    """
    letters = word.strip("()")
    # Codes are 1-4 alphabetic characters; anything longer or with
    # digits/punctuation is treated as a real word.
    if not letters.isalpha() or len(letters) > 4:
        return False

    # Comprehensive whitelist of real short surnames found in NZ ERA cases.
    # Anything short that's NOT in this list is treated as an anonymised code.
    real_short_names = {
        "ADDY", "ADO", "ALI", "AMIN", "AMOS", "AN", "ASH",
        "BA", "BABE", "BAIG", "BAIN", "BALI", "BAN", "BARR", "BAT", "BATE", "BAY",
        "BEAL", "BEDI", "BEGG", "BEHL", "BELK", "BELL", "BEST",
        "BIAN", "BIN", "BIRD", "BOW", "BOYD", "BRAR", "BULL", "BURN", "BURT",
        "BYE", "BYUN",
        "CAO", "CARR", "CASH", "CHAO", "CHEN", "CHIU", "CHO", "CHOI", "CHUN",
        "COE", "COLE", "COOK",
        "DARD", "DAX", "DAY", "DAYS", "DEAN", "DEED", "DELL", "DENT", "DEVI",
        "DING", "DO", "DODD", "DOMA", "DON", "DU", "DUAN", "DUFF", "DUNN", "DUTT",
        "EDE", "EDO", "ELS", "ESAU", "EWE",
        "FALE", "FAN", "FAY", "FENG", "FISO", "FOO", "FORD", "FOX", "FREW", "FRY",
        "GANG", "GAO", "GE", "GERA", "GILL", "GO", "GONG", "GOOD", "GORE", "GOUK", "GOW",
        "GRAY", "GREY", "GU", "GUAN", "GUO", "GUSH", "GUY",
        "HAIG", "HALL", "HAO", "HART", "HAU", "HE", "HEAD", "HEY", "HIKA", "HILL",
        "HO", "HOGG", "HONG", "HOOD", "HORN", "HOU", "HOW", "HOWE",
        "HU", "HUA", "HUCH", "HUIA", "HULL", "HUNT", "HUTT", "HYDE",
        "ILIN", "ION",
        "JANE", "JAY", "JEON", "JI", "JIA", "JIAN", "JIAO", "JIN", "JOE", "JOO",
        "JUDD", "JUN", "JUNG", "JURY",
        "KAN", "KANG", "KATH", "KAUR", "KAYA", "KEIR", "KEMP", "KENT", "KERR",
        "KHAN", "KIDD", "KIM", "KING", "KIRK", "KNOX", "KO", "KOA", "KOUR",
        "KRUG", "KU", "KUO", "KWON",
        "LA", "LAI", "LAL", "LALL", "LAN", "LANE", "LANZ", "LARK", "LATA", "LAU",
        "LE", "LEE", "LEES", "LEON", "LI", "LIAN", "LIAO", "LIEN", "LIN", "LING",
        "LINO", "LION", "LIU", "LO", "LOLI", "LONG", "LOO", "LOOY", "LOVE", "LOWE",
        "LU", "LUAN", "LUGG", "LUSH", "LYE",
        "MA", "MALA", "MANA", "MANN", "MANU", "MAO", "MARK", "MAY", "MENG", "MIKI",
        "MOKE", "MON", "MONK", "MUIR", "MUMM", "MURU",
        "NAIK", "NAIR", "NATH", "NEAL", "NEL", "NERU", "NI", "NIE", "NIU", "NIX",
        "NODA", "NONU", "NURI",
        "OH", "ONG", "OO", "OPAI", "OWEN", "OZ",
        "PAGE", "PAKU", "PALU", "PAM", "PAN", "PARK", "PARR", "PASK", "PAUL",
        "PECK", "PEEL", "PEHI", "PENG", "PERE", "PIKE", "PINK", "POI", "PUGH", "PULE", "PYNE",
        "QI", "QIAN",
        "RAI", "RAJ", "RANA", "READ", "REES", "REID", "REN", "RICE", "ROCK", "ROE",
        "ROSE", "ROSS", "ROY", "RUSS", "RYAN", "RYU",
        "SAM", "SAN", "SEL", "SELL", "SHAH", "SHAO", "SHAW", "SHEA", "SHEN", "SHI",
        "SHO", "SI", "SIMS", "SIU", "SON", "SONG", "SOSA", "SU", "SUH", "SUN",
        "SWAN", "SYED", "SZE",
        "TAIA", "TANE", "TAWA", "TEN", "TEO", "THUY", "TODD", "TONG", "TORA", "TRAN",
        "TUA", "TUPE", "TYE", "TYER",
        "VILE", "VITA",
        "WANG", "WARD", "WEBB", "WEDD", "WEI", "WEIR", "WEN", "WEST",
        "WILD", "WIN", "WON", "WONG", "WOOD", "WOUD", "WU", "WYND",
        "XIAO", "XIE", "XU",
        "YAN", "YANG", "YAO", "YEE", "YIN", "YOO", "YOON", "YU", "YUAN",
        "ZENG", "ZHAO", "ZHEN", "ZHOU", "ZHU", "ZHUO", "ZINK",
    }

    # Common English words and legal terms that appear in party names
    # and should NOT be treated as anonymised codes
    common_words = {
        "ORS", "ANOR", "NEW", "OLD", "ALL", "OUR", "HIS", "HER", "ITS", "NOT",
        "BUT", "NOR", "YET", "SO", "IF", "UP", "OUT", "OFF", "ONE", "TWO",
        "OWN", "ANY", "FEW", "NET", "SET", "RUN", "LET", "CUT", "PUT", "GOT",
        "SAY", "SEE", "END", "USE", "ADD", "TRY", "ASK", "ACT", "GET", "SIT",
        "TOP", "LAW", "DAM", "BAR", "CAR", "WAY", "KEY", "JOB", "AID", "MAN",
        "MEN", "BOY", "SON", "SIR", "MRS", "MR", "MS", "DR",
        "CO", "NO", "RE", "EX",
        "FIRE", "CITY", "RAIL", "EAST", "WEST", "ISLE", "LAND", "LAKE", "PARK",
        "VICE", "CLUB", "BANK", "CARE", "HOME", "FUND", "WINE", "AUTO",
        "BEST", "FOUR", "FIVE", "GOLD", "BLUE", "REAL",
    }

    # A short alphabetic word is a code only if it matches neither list
    candidate = letters.upper()
    return candidate not in real_short_names and candidate not in common_words


def title_case_party(name):
    """
    Convert an UPPERCASE party name to Title Case joined with underscores,
    e.g. "NEW ZEALAND POST LIMITED" -> "New_Zealand_Post_Limited".
    Known acronyms stay uppercase, articles/prepositions go lowercase
    (except as the first word), and anonymised letter codes (e.g. "ABC",
    "GF") are kept uppercase.
    """
    # Acronyms/abbreviations that must remain uppercase
    stay_upper = {"NZ", "LLC", "LP", "PTE", "CEO", "CFO", "HR", "IT", "DHB", "MPI",
                  "MBIE", "MOH", "ACC", "IRD", "MSD", "DOC", "MOJ"}
    # Articles/prepositions that go lowercase inside legal names
    lowercase_words = {"OF", "THE", "AND", "IN", "ON", "AT", "TO", "FOR", "BY", "AS",
                       "OR", "AN", "A", "WITH", "FROM"}
    # Fixed replacements for trading-as markers and company suffixes
    special = {"T/A": "t-a", "TA": "t-a", "LIMITED": "Limited", "LTD": "Ltd",
               "INCORPORATED": "Incorporated", "INC": "Inc"}

    out = []
    for idx, token in enumerate(name.split()):
        upper = token.upper()
        if upper in special:
            out.append(special[upper])
            continue
        if upper in stay_upper:
            out.append(upper)
            continue
        if upper in lowercase_words and idx > 0:
            # Lowercase unless it's the leading word
            out.append(token.lower())
            continue
        if is_anonymised_code(token):
            # Anonymised party codes stay uppercase (e.g. ABC, GF, XDR),
            # including parenthesised ones like (NZ)
            if token.startswith("("):
                out.append("(" + token[1:].strip(")").upper() + ")")
            else:
                out.append(token.upper())
            continue
        if token.startswith("(") and len(token) > 1:
            # Parenthesized words: capitalize the content after "("
            out.append("(" + token[1:].capitalize())
            continue
        out.append(token.capitalize())

    return "_".join(out)


def format_date_for_filename(date_str):
    """Convert a date string to DD_Month_YYYY form (e.g. "5_May_2025").

    Tries several common input formats; when none match, the input is
    returned sanitized for filesystem use instead.
    """
    if not date_str:
        return ""

    text = date_str.strip()

    known_formats = ("%d %B %Y", "%d %b %Y", "%Y-%m-%d", "%d/%m/%Y", "%B %d, %Y")
    for fmt in known_formats:
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue
        return f"{parsed.day}_{MONTH_NAMES[parsed.month]}_{parsed.year}"

    # Unparseable: keep the raw text, made filename-safe
    return sanitize_filename(text)


def build_empc_filename(title, citation, date_str):
    """
    Build a filename from Employment Court decision metadata.
    Format: Party_v_Party_[Year]_NZEmpC_Number_(DD_Month_YYYY).pdf
    """
    # Citation like "[2026] NZEmpC 27" -> "[2026]_NZEmpC_27"
    stripped_cite = citation.strip()
    match = re.match(r'\[?(\d{4})\]?\s*(NZEmpC)\s+(\d+)', stripped_cite)
    if match:
        year, court, num = match.groups()
        cite_part = f"[{year}]_{court}_{num}"
    else:
        # Unrecognised citation shape: keep it, made filesystem-safe
        cite_part = sanitize_filename(stripped_cite)

    pieces = [sanitize_filename(title.strip()), cite_part]

    when = format_date_for_filename(date_str)
    if when:
        pieces.append(f"({when})")

    # Sanitize, squash double underscores, enforce the 255-byte cap
    assembled = sanitize_filename("_".join(pieces) + ".pdf").replace("__", "_")
    return truncate_filename(assembled)


# === File Index (built once at startup, updated on download) ===

class FileIndex:
    """
    In-memory index of existing files for fast duplicate detection.

    Built once at startup by scanning the download directory, then updated
    incrementally as files are downloaded, so each duplicate check is O(1)
    instead of a directory scan. Matching happens three ways: exact
    filename, "fuzzy" stem (case/separator-insensitive), and citation.
    """

    # Citation patterns embedded in filenames, e.g. [2025]_NZERA_370
    _CITE_RE = re.compile(r'\[?(\d{4})\]?[_\s]*(NZERA|NZEmpC|NZEmpT)[_\s]*(\d+)')

    def __init__(self):
        self._fuzzy_stems = set()   # lowered, separator-stripped stems
        self._citations = set()     # normalized "YYYY_COURT_NUM" strings
        self._exact_names = set()   # raw filenames

    @staticmethod
    def _normalize_stem(filename):
        """Lowercase the stem and drop underscores, hyphens and spaces."""
        return Path(filename).stem.lower().replace("_", "").replace("-", "").replace(" ", "")

    def build(self, directory):
        """Scan directory once and build the index."""
        total = 0
        for entry in directory.iterdir():
            if entry.is_file():
                self._add_file(entry.name)
                total += 1
        log.info(f"File index built: {total} files, {len(self._citations)} citations indexed")

    def _add_file(self, filename):
        """Record one filename under all three matching schemes."""
        self._exact_names.add(filename)
        self._fuzzy_stems.add(self._normalize_stem(filename))
        for m in self._CITE_RE.finditer(filename):
            self._citations.add("_".join(m.groups()))

    def add_downloaded(self, filename):
        """Register a newly downloaded file in the index."""
        self._add_file(filename)

    def file_already_exists(self, filename):
        """Check if a file (or close variant) already exists."""
        if filename in self._exact_names:
            return True
        return self._normalize_stem(filename) in self._fuzzy_stems

    def check_existing_by_citation(self, citation):
        """Check if any indexed file carries this citation."""
        if not citation:
            return False
        # Normalize: "[2025] NZERA 370" -> "2025_NZERA_370"
        normalized = re.sub(r'[\[\]\s]+', '_', citation.strip()).strip('_')
        return normalized in self._citations


# Global index instance - built in main() before scraping starts.
# Consulted via the module-level wrapper functions below; download_file()
# registers each newly saved file with it.
file_index = FileIndex()


def file_already_exists(filename):
    """Check if a file (or close variant) already exists.

    Thin module-level wrapper over the global FileIndex instance.
    """
    return file_index.file_already_exists(filename)


def check_existing_by_citation(citation):
    """Check if we already have a file with this citation in the name.

    Thin module-level wrapper over the global FileIndex instance.
    """
    return file_index.check_existing_by_citation(citation)


def download_file(url, filename, dry_run=False):
    """Download *url* into BASE_DIR/*filename* with retry logic.

    Args:
        url: Absolute URL of the PDF to fetch.
        filename: Target filename (relative to BASE_DIR).
        dry_run: When True, log the intended download and do nothing.

    Returns:
        One of "downloaded", "skipped", "dry_run", or "failed".
    """
    target = BASE_DIR / filename

    if target.exists():
        # Fixed: these log lines previously printed the literal "(unknown)"
        # instead of the filename, making the log useless for auditing.
        log.info(f"SKIP (exists): {filename}")
        return "skipped"

    if dry_run:
        log.info(f"DRY RUN: Would download {filename} from {url}")
        return "dry_run"

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Stream to disk in chunks so large PDFs don't sit in memory
            resp = session.get(url, timeout=REQUEST_TIMEOUT, stream=True)
            resp.raise_for_status()

            with open(target, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)

            file_size = target.stat().st_size
            log.info(f"DOWNLOADED: {filename} ({file_size:,} bytes)")
            file_index.add_downloaded(filename)

            # Check disk space after download
            free_gb = check_disk_space()
            log.debug(f"Space after download: {free_gb:.1f} GB free")

            return "downloaded"

        except requests.exceptions.RequestException as e:
            # Remove any partially written file: a truncated PDF left behind
            # would be mistaken for a completed download on the next run.
            if target.exists():
                try:
                    target.unlink()
                except OSError:
                    log.warning(f"Could not remove partial file: {target}")
            log.warning(f"Download attempt {attempt}/{MAX_RETRIES} failed for {filename}: {e}")
            if attempt < MAX_RETRIES:
                time.sleep(RATE_LIMIT_RETRY)
            else:
                log.error(f"FAILED after {MAX_RETRIES} attempts: {filename} from {url}")
                return "failed"

    return "failed"


def fetch_page(url, params=None):
    """Fetch *url* and return its body text, or None after MAX_RETRIES failures.

    Waits RATE_LIMIT_RETRY seconds between attempts.
    """
    attempt = 0
    while attempt < MAX_RETRIES:
        attempt += 1
        try:
            response = session.get(url, params=params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as exc:
            log.warning(f"Page fetch attempt {attempt}/{MAX_RETRIES} failed for {url}: {exc}")
            if attempt < MAX_RETRIES:
                time.sleep(RATE_LIMIT_RETRY)
            else:
                log.error(f"FAILED to fetch page after {MAX_RETRIES} attempts: {url}")
                return None
        else:
            return response.text
    return None


# === State Management (for resume) ===

def load_state():
    """Load persisted scraper state (offsets + result lists) for --resume.

    Returns:
        The state dict from STATE_FILE, or a fresh default state when the
        file is absent or unreadable. A corrupt/truncated state file (e.g.
        the process was killed mid-write) previously crashed the scraper
        with JSONDecodeError; now it just starts over with defaults.
    """
    default = {"era_offset": 0, "empc_offset": 0, "downloaded": [], "failed": [], "skipped": []}
    if not STATE_FILE.exists():
        return default
    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        log.warning(f"State file unreadable ({e}); starting with fresh state")
        return default


def save_state(state):
    """Persist the current scraper state to STATE_FILE as indented JSON."""
    STATE_FILE.write_text(json.dumps(state, indent=2))


# === ERA Scraper ===

def scrape_era_page(offset):
    """Scrape one page of ERA search results.

    Submits an empty search (i.e. "all determinations") to the ERA search
    form, paginated via the ``start`` query parameter.

    Args:
        offset: Zero-based result offset for pagination.

    Returns:
        (entries, has_more): ``entries`` is a list of dicts whose keys are
        drawn from {title, detail_url, office, pdf_url, pdf_text, citation,
        date} (each present only when found in the markup); ``has_more``
        says whether results exist beyond this page.
    """
    # Empty search criteria == "return everything"; only the offset varies.
    params = {
        "Parties": "",
        "Keywords": "",
        "DateFrom": "",
        "DateTo": "",
        "action_doSearch": "Search",
        "start": str(offset),
    }

    html = fetch_page(ERA_SEARCH_URL, params)
    if not html:
        # Fetch failed entirely: report no entries and no further pages.
        return [], False

    soup = BeautifulSoup(html, "lxml")

    # Extract total results from the "Results X-Y of Z" summary paragraph
    results_text = ""
    for p in soup.find_all("p"):
        text = p.get_text().strip()
        if re.match(r"Results \d+-\d+ of \d+", text):
            results_text = text
            break

    total_match = re.search(r"of (\d+)", results_text)
    total = int(total_match.group(1)) if total_match else 0

    # Extract individual results
    entries = []
    records = soup.find_all("li", class_="search-results__record")

    for record in records:
        entry = {}

        # Title link doubles as the determination's detail-page URL
        title_el = record.find("p", class_="search-results__record__title")
        if title_el and title_el.find("a"):
            entry["title"] = title_el.find("a").get_text().strip()
            entry["detail_url"] = title_el.find("a").get("href", "")

        # Office/Member info (taken from the record's second <p>, when present)
        paragraphs = record.find_all("p")
        if len(paragraphs) > 1:
            entry["office"] = paragraphs[1].get_text().strip()

        # Data items (PDF link, date, citation)
        data_items = record.find_all("li", class_="p-0")
        for item in data_items:
            a = item.find("a")
            if a and ".pdf" in str(a.get("href", "")):
                entry["pdf_url"] = a.get("href", "")
                entry["pdf_text"] = a.get_text().strip()
            else:
                # Non-PDF items are classified by text shape: citation vs date
                text = item.get_text().strip()
                if re.match(r'\[?\d{4}\]?\s*NZERA\s+\d+', text):
                    entry["citation"] = text
                elif re.match(r'\d{1,2}\s+\w+\s+\d{4}', text):
                    entry["date"] = text

        # Keep only records that yielded a title
        if entry.get("title"):
            entries.append(entry)

    has_more = offset + ERA_RESULTS_PER_PAGE < total
    return entries, has_more


def scrape_era(state, dry_run=False):
    """Scrape all ERA determinations.

    Walks the paginated search results from ``state["era_offset"]`` onward,
    downloading each determination PDF that is not already on disk. Progress
    is persisted after every page so --resume can continue mid-run.

    Args:
        state: Mutable resume-state dict; ``era_offset`` and ``failed`` are
            updated in place and saved via save_state().
        dry_run: When True, log what would be downloaded without fetching.

    Returns:
        Stats dict with counts: downloaded, skipped, failed, no_pdf.
    """
    log.info("=" * 60)
    log.info("Starting ERA scraper")
    log.info("=" * 60)

    offset = state.get("era_offset", 0)
    stats = {"downloaded": 0, "skipped": 0, "failed": 0, "no_pdf": 0}

    while True:
        # Check disk space before each page; stop rather than fill the volume
        free_gb = check_disk_space()
        if free_gb < MIN_SPACE_GB:
            log.warning(f"LOW DISK SPACE: {free_gb:.1f} GB free (threshold: {MIN_SPACE_GB} GB)")
            log.warning("Pausing scraper. Free up space and run with --resume to continue.")
            break

        log.info(f"ERA: Fetching page at offset {offset}...")
        entries, has_more = scrape_era_page(offset)

        if not entries and not has_more:
            log.info(f"ERA: No more results at offset {offset}")
            break

        for entry in entries:
            title = entry.get("title", "Unknown")
            citation = entry.get("citation", "")
            date_str = entry.get("date", "")
            office = entry.get("office", "")
            pdf_url = entry.get("pdf_url", "")

            # Some determinations are listed without a PDF; count and move on
            if not pdf_url:
                log.debug(f"ERA: No PDF for: {title} {citation}")
                stats["no_pdf"] += 1
                continue

            # Build filename
            filename = build_era_filename(title, citation, date_str, office)

            # Check if already exists (by filename or by citation)
            if file_already_exists(filename) or check_existing_by_citation(citation):
                log.debug(f"ERA SKIP (exists): {citation} - {title}")
                stats["skipped"] += 1
                continue

            # Absolutize relative PDF links returned by the search page
            if pdf_url.startswith("/"):
                full_url = ERA_BASE + pdf_url
            else:
                full_url = pdf_url

            # Download
            result = download_file(full_url, filename, dry_run)
            if result == "downloaded":
                stats["downloaded"] += 1
            elif result == "skipped":
                stats["skipped"] += 1
            elif result == "failed":
                stats["failed"] += 1
                # Record failures in state so they can be retried/reported later
                state.setdefault("failed", []).append({
                    "url": full_url, "filename": filename, "citation": citation
                })

            time.sleep(RATE_LIMIT_DOWNLOAD)

        # Persist progress after every page so --resume can pick up here
        offset += ERA_RESULTS_PER_PAGE
        state["era_offset"] = offset
        save_state(state)

        if not has_more:
            log.info("ERA: Reached end of results")
            break

        time.sleep(RATE_LIMIT_LISTING)

    log.info(f"ERA complete: {stats['downloaded']} downloaded, {stats['skipped']} skipped, "
             f"{stats['failed']} failed, {stats['no_pdf']} no PDF available")
    return stats


# === Employment Court Scraper ===

def scrape_empc_page(offset):
    """Scrape one page of Employment Court decisions.

    Args:
        offset: Zero-based result offset for pagination.

    Returns:
        (entries, has_more): ``entries`` is a list of de-duplicated dicts
        with keys drawn from {pdf_url, citation, title, date}; ``has_more``
        reflects the presence of a "Next" pagination link.
    """
    # Filter_Jurisdiction=17 narrows the judgments listing; presumably the
    # employment-jurisdiction filter - TODO confirm against the site.
    url = f"{EMPC_DECISIONS_URL}?Filter_Jurisdiction=17&start={offset}"

    html = fetch_page(url)
    if not html:
        return [], False

    soup = BeautifulSoup(html, "lxml")

    entries = []

    # Find all decision links (PDF links with /assets/Documents/Decisions/)
    for a in soup.find_all("a", href=re.compile(r"/assets/Documents/Decisions/.*\.pdf")):
        entry = {}
        pdf_url = a.get("href", "")
        entry["pdf_url"] = pdf_url

        # Try to extract citation and title from the header inside the link
        header = a.find("h4")
        if header:
            header_text = header.get_text().strip()
            # Citation is usually like "[2026] NZEmpC 27"
            cite_match = re.search(r'\[(\d{4})\]\s*(NZEmpC)\s+(\d+)', header_text)
            if cite_match:
                entry["citation"] = f"[{cite_match.group(1)}] {cite_match.group(2)} {cite_match.group(3)}"
                # Title is whatever comes after the citation
                remaining = header_text[cite_match.end():].strip()
                if remaining:
                    entry["title"] = remaining.strip(" -–")
                else:
                    # Header held only the citation: derive a title from the
                    # PDF's own filename instead.
                    pdf_name = os.path.basename(unquote(pdf_url))
                    # Remove .pdf and common prefixes
                    name_clean = re.sub(r'\.pdf$', '', pdf_name)
                    name_clean = re.sub(r'^\d{4}-NZEmpC-\d+-', '', name_clean)
                    name_clean = re.sub(r'^EMPC-\d+-\d+-', '', name_clean)
                    entry["title"] = name_clean.replace("-", " ").strip()

        # Try to find a date anywhere in the link's parent element text
        parent = a.find_parent()
        if parent:
            text = parent.get_text()
            date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', text)
            if date_match:
                entry["date"] = date_match.group(1)

        if entry.get("pdf_url"):
            entries.append(entry)

    # Check pagination - is there a next page?
    has_more = False
    next_link = soup.find("a", string=re.compile(r"Next"))
    if next_link:
        has_more = True

    # De-duplicate (each PDF link appears twice in the HTML)
    seen_urls = set()
    unique_entries = []
    for e in entries:
        if e["pdf_url"] not in seen_urls:
            seen_urls.add(e["pdf_url"])
            unique_entries.append(e)

    return unique_entries, has_more


def scrape_empc(state, dry_run=False):
    """Scrape all Employment Court decisions.

    Walks the paginated decisions listing from ``state["empc_offset"]``
    onward, downloading each decision PDF not already on disk. Progress is
    persisted after every page so --resume can continue mid-run.

    Args:
        state: Mutable resume-state dict; ``empc_offset`` and ``failed``
            are updated in place and saved via save_state().
        dry_run: When True, log what would be downloaded without fetching.

    Returns:
        Stats dict with counts: downloaded, skipped, failed.
    """
    log.info("=" * 60)
    log.info("Starting Employment Court scraper")
    log.info("=" * 60)

    offset = state.get("empc_offset", 0)
    stats = {"downloaded": 0, "skipped": 0, "failed": 0}

    while True:
        # Check disk space before each page; stop rather than fill the volume
        free_gb = check_disk_space()
        if free_gb < MIN_SPACE_GB:
            log.warning(f"LOW DISK SPACE: {free_gb:.1f} GB free (threshold: {MIN_SPACE_GB} GB)")
            log.warning("Pausing scraper. Free up space and run with --resume to continue.")
            break

        log.info(f"EMPC: Fetching page at offset {offset}...")
        entries, has_more = scrape_empc_page(offset)

        if not entries and not has_more:
            log.info(f"EMPC: No more results at offset {offset}")
            break

        for entry in entries:
            pdf_url = entry.get("pdf_url", "")
            citation = entry.get("citation", "")
            title = entry.get("title", "")
            date_str = entry.get("date", "")

            if not pdf_url:
                continue

            # Absolutize the PDF link (listing pages mix relative/absolute)
            if pdf_url.startswith("/"):
                full_url = EMPC_BASE + pdf_url
            elif pdf_url.startswith("http"):
                full_url = pdf_url
            else:
                full_url = EMPC_BASE + "/" + pdf_url

            # Build filename
            if citation and title:
                filename = build_empc_filename(title, citation, date_str)
            else:
                # Fallback: use the original PDF filename from the URL, sanitized
                orig_name = os.path.basename(unquote(pdf_url))
                filename = sanitize_filename(orig_name)

            # Check if already exists
            if file_already_exists(filename) or check_existing_by_citation(citation):
                log.debug(f"EMPC SKIP (exists): {citation} - {title}")
                stats["skipped"] += 1
                continue

            # Download
            result = download_file(full_url, filename, dry_run)
            if result == "downloaded":
                stats["downloaded"] += 1
            elif result == "skipped":
                stats["skipped"] += 1
            elif result == "failed":
                stats["failed"] += 1
                # Record failures in state so they can be retried/reported later
                state.setdefault("failed", []).append({
                    "url": full_url, "filename": filename, "citation": citation
                })

            time.sleep(RATE_LIMIT_DOWNLOAD)

        # Persist progress after every page so --resume can pick up here
        offset += EMPC_RESULTS_PER_PAGE
        state["empc_offset"] = offset
        save_state(state)

        if not has_more:
            log.info("EMPC: Reached end of results")
            break

        time.sleep(RATE_LIMIT_LISTING)

    log.info(f"EMPC complete: {stats['downloaded']} downloaded, {stats['skipped']} skipped, "
             f"{stats['failed']} failed")
    return stats


# === NZLII Scraper ===

# Separate session for NZLII with browser-like headers
nzlii_session = requests.Session()
nzlii_session.headers.update(NZLII_HEADERS)
# NOTE(review): disabling TLS verification exposes these requests to MITM and
# will make urllib3 emit InsecureRequestWarning on every call - confirm
# NZLII's certificate chain is actually broken before keeping this.
nzlii_session.verify = False  # NZLII has certificate issues


def nzlii_fetch_page(url, referer=None):
    """Fetch a NZLII page with browser-like headers and retry logic.

    Returns the body text, or None on HTTP 410 (NZLII's bot block) or
    after MAX_RETRIES failed attempts.
    """
    extra_headers = {"Referer": referer} if referer else {}

    attempt = 0
    while attempt < MAX_RETRIES:
        attempt += 1
        try:
            response = nzlii_session.get(url, headers=extra_headers, timeout=REQUEST_TIMEOUT)
            if response.status_code == 410:
                # NZLII answers 410 Gone to clients it considers bots
                log.warning(f"NZLII returned 410 for {url} - may need header adjustment")
                return None
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as exc:
            log.warning(f"NZLII fetch attempt {attempt}/{MAX_RETRIES} failed for {url}: {exc}")
            if attempt < MAX_RETRIES:
                time.sleep(RATE_LIMIT_RETRY)

    log.error(f"FAILED to fetch NZLII page after {MAX_RETRIES} attempts: {url}")
    return None


def nzlii_get_years(db_path):
    """Return the sorted list of year directories available for an NZLII database.

    Returns an empty list when the index page cannot be fetched.
    """
    page = nzlii_fetch_page(NZLII_BASE + db_path, referer=NZLII_BASE + "/")
    if not page:
        return []

    # Year links on the database index look like href="2005/".
    anchors = BeautifulSoup(page, "lxml").find_all("a", href=re.compile(r'^\d{4}/$'))
    year_strs = (a.get("href", "").strip("/") for a in anchors)
    return sorted(int(y) for y in year_strs if y.isdigit())


def nzlii_parse_entry(link_text, href, page_url):
    """Parse one NZLII index link into a decision-metadata dict.

    link_text examples:
        "DQJ v KSW [2026] NZERA 44 (28 January 2026)"
        "A v Lowe Corporation Ltd WA 1/05 (Wellington) [2005] NZERA 183 (5 January 2005)"

    Always returns a dict with "link_text" and "href"; adds "year", "court",
    "number", "citation", "date", "title" and "html_url" when they can be
    extracted.  page_url is the year-index URL used to resolve relative hrefs.
    """
    entry = {"link_text": link_text, "href": href}

    # Citation: [Year] COURT Num
    cite_match = re.search(r'\[(\d{4})\]\s+(NZERA|NZEmpC|NZEmpT)\s+(\d+)', link_text)
    if cite_match:
        entry["year"] = cite_match.group(1)
        entry["court"] = cite_match.group(2)
        entry["number"] = cite_match.group(3)
        entry["citation"] = f"[{cite_match.group(1)}] {cite_match.group(2)} {cite_match.group(3)}"

    # Decision date: "(DD Month YYYY)" at the end of the link text.
    date_match = re.search(r'\((\d{1,2}\s+\w+\s+\d{4})\)\s*$', link_text)
    if date_match:
        entry["date"] = date_match.group(1)

    # Title: everything before the citation, minus legacy decorations.
    if cite_match:
        title = link_text[:cite_match.start()].strip()
        # Remove old-style case numbers like "WA 1/05" or "AA 2/05" or "CA 3/05"
        title = re.sub(r'\s+[WCA]A\s+\d+/\d+\s*', ' ', title).strip()
        # Remove location in parens if it appears before citation (old format)
        title = re.sub(r'\s*\((Auckland|Wellington|Christchurch)\)\s*$', '', title).strip()
        entry["title"] = title

    # Resolve the decision's HTML page URL.  urljoin handles relative
    # ("../2026/44.html", "44.html"), root-relative ("/nz/...") and absolute
    # ("http...") hrefs uniformly.  The previous hand-rolled resolution used
    # href.lstrip('../'), which strips a *character set* rather than the
    # "../" prefix and could mangle hrefs with other leading dots/slashes.
    if href:
        entry["html_url"] = urljoin(page_url, href)

    return entry


def nzlii_get_year_entries(db_path, year):
    """Get all decision entries for a given year from NZLII.

    Fetches the year-index page and parses each listed decision via
    nzlii_parse_entry().  Entries without a recognisable citation
    (navigation links, headers) are dropped.  Returns [] on fetch failure.
    """
    url = f"{NZLII_BASE}{db_path}{year}/"
    referer = NZLII_BASE + db_path
    html = nzlii_fetch_page(url, referer=referer)
    if not html:
        return []

    soup = BeautifulSoup(html, "lxml")
    entries = []

    for li in soup.find_all("li", class_="make-database"):
        a = li.find("a", class_="make-database")
        if not a:
            continue

        entry = nzlii_parse_entry(a.get_text().strip(), a.get("href", ""), url)
        if entry.get("citation"):
            entries.append(entry)

    return entries


def nzlii_format_party(name):
    """
    Make an NZLII party name filename-safe.

    NZLII titles arrive correctly mixed-cased (unlike ERA's ALL CAPS), so no
    re-casing is needed here: normalise "t/a" (trading as) to "t-a" so the
    slash cannot reach the filename, then delegate to sanitize_filename().
    """
    no_slash = re.sub(r'\s+t/a\s+', ' t-a ', name, flags=re.IGNORECASE)
    return sanitize_filename(no_slash)


def build_nzlii_filename(entry):
    """
    Build the target PDF filename for an NZLII entry.
    Format: Party_v_Party_[Year]_COURT_Number_(DD_Month_YYYY).pdf
    """
    title = entry.get("title", "")

    # Party names: split at the FIRST " v " (partition covers both the
    # single-occurrence and multiple-occurrence cases identically).
    left, sep, right = title.partition(" v ")
    if sep:
        title_formatted = (f"{nzlii_format_party(left.strip())}"
                           f"_v_{nzlii_format_party(right.strip())}")
    else:
        # No " v " — not a two-party style, sanitize the whole title.
        title_formatted = sanitize_filename(title)

    # Citation part: prefer the structured fields, fall back to the raw citation.
    year = entry.get("year", "")
    court = entry.get("court", "")
    number = entry.get("number", "")
    if year and court and number:
        cite_part = f"[{year}]_{court}_{number}"
    else:
        cite_part = sanitize_filename(entry.get("citation", ""))

    pieces = [title_formatted, cite_part]

    date_part = format_date_for_filename(entry.get("date", ""))
    if date_part:
        pieces.append(f"({date_part})")

    filename = sanitize_filename("_".join(pieces) + ".pdf").replace("__", "_")

    # Respect the filesystem's filename length limit.
    return truncate_filename(filename)


def _nzlii_download_direct_file(file_url, target, referer=None):
    """Download a PDF or RTF file directly from NZLII.

    PDF responses are written straight to *target*.  RTF responses are saved
    to a temp file and converted to PDF — LibreOffice first, wkhtmltopdf as a
    best-effort fallback.  Returns True when *target* exists and looks like a
    real file (>100 bytes, or a successful LibreOffice rename), else False.

    Raises requests.exceptions.HTTPError for non-410 HTTP errors (the caller
    retries), and may raise subprocess.TimeoutExpired from the converters.
    """
    is_rtf = file_url.lower().endswith(".rtf")
    headers = {"Accept": "application/rtf,*/*" if is_rtf else "application/pdf,*/*"}
    if referer:
        headers["Referer"] = referer

    resp = nzlii_session.get(file_url, headers=headers, timeout=REQUEST_TIMEOUT)
    if resp.status_code == 410:
        # NZLII's bot-block response; not retryable.
        log.warning(f"NZLII returned 410 for direct file {file_url}")
        return False
    resp.raise_for_status()

    if not is_rtf:
        # Direct PDF download.
        with open(target, "wb") as f:
            f.write(resp.content)
        return target.exists() and target.stat().st_size > 100

    # RTF: save to a temp file, convert to PDF with LibreOffice.
    with tempfile.NamedTemporaryFile(suffix=".rtf", delete=False) as tmp:
        tmp.write(resp.content)
        tmp_path = tmp.name
    try:
        out_dir = str(target.parent)
        lo_result = subprocess.run(
            ["libreoffice", "--headless", "--convert-to", "pdf",
             "--outdir", out_dir, tmp_path],
            capture_output=True, timeout=120
        )
        # LibreOffice names its output after the input file's stem.
        lo_output = Path(out_dir) / (Path(tmp_path).stem + ".pdf")
        if lo_output.exists():
            lo_output.rename(target)
            return True
        # Surface why LibreOffice failed before trying the fallback
        # (previously its result was captured but never inspected).
        lo_stderr = lo_result.stderr.decode("utf-8", errors="replace")
        log.warning(f"LibreOffice conversion failed (rc={lo_result.returncode}): "
                    f"{lo_stderr[:200]}; trying wkhtmltopdf")
        # Best-effort fallback; wkhtmltopdf is an HTML converter, so this
        # only helps when the "RTF" is actually HTML-ish content.
        subprocess.run(
            ["wkhtmltopdf", "--quiet", tmp_path, str(target)],
            capture_output=True, timeout=120
        )
        return target.exists() and target.stat().st_size > 100
    finally:
        # Always remove the temp RTF, even when conversion raised.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass


def nzlii_download_as_pdf(html_url, filename, referer=None, dry_run=False):
    """Download an NZLII decision HTML page and convert it to PDF.

    Handles three NZLII page types:
    1. Embedded PDF via <object data="...pdf"> - downloads the PDF directly
    2. RTF download link via <a href="...rtf"> - downloads RTF and converts to PDF
    3. Inline HTML content - converts to PDF with wkhtmltopdf

    Saves to BASE_DIR/filename and registers the file with file_index.
    Returns one of "downloaded", "skipped", "dry_run" or "failed".

    Fix: the log messages previously printed the literal text "(unknown)"
    where the filename belonged; they now interpolate {filename}.
    """
    target = BASE_DIR / filename

    if target.exists():
        log.info(f"SKIP (exists): {filename}")
        return "skipped"

    if dry_run:
        log.info(f"DRY RUN: Would download {filename} from {html_url}")
        return "dry_run"

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            headers = {}
            if referer:
                headers["Referer"] = referer

            resp = nzlii_session.get(html_url, headers=headers, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 410:
                # NZLII's bot-block response; retrying will not help.
                log.warning(f"NZLII returned 410 for {html_url}")
                return "failed"
            resp.raise_for_status()

            html_content = resp.text
            soup = BeautifulSoup(html_content, "lxml")

            # --- Check for embedded PDF via <object data="...pdf"> ---
            obj_tag = soup.find("object", attrs={"data": re.compile(r'\.pdf$', re.I)})
            if obj_tag and obj_tag.get("data"):
                # urljoin resolves relative, root-relative and absolute
                # URLs uniformly (replaces the manual rsplit logic).
                pdf_direct_url = urljoin(html_url, obj_tag["data"])
                log.debug(f"Found embedded PDF: {pdf_direct_url}")
                if _nzlii_download_direct_file(pdf_direct_url, target, referer=html_url):
                    file_size = target.stat().st_size
                    log.info(f"DOWNLOADED (direct PDF): {filename} ({file_size:,} bytes)")
                    file_index.add_downloaded(filename)
                    return "downloaded"
                log.warning(f"Direct PDF download failed for {filename}, "
                            f"falling through to HTML conversion")

            # --- Check for RTF download link ---
            rtf_link = soup.find("a", href=re.compile(r'\.rtf$', re.I))
            if rtf_link and rtf_link.get("href"):
                rtf_url = urljoin(html_url, rtf_link["href"])
                log.debug(f"Found RTF link: {rtf_url}")
                if _nzlii_download_direct_file(rtf_url, target, referer=html_url):
                    file_size = target.stat().st_size
                    log.info(f"DOWNLOADED (RTF->PDF): {filename} ({file_size:,} bytes)")
                    file_index.add_downloaded(filename)
                    return "downloaded"
                log.warning(f"RTF download/conversion failed for {filename}, "
                            f"falling through to HTML conversion")

            # --- Inline HTML content: convert to PDF with wkhtmltopdf ---
            with tempfile.NamedTemporaryFile(suffix=".html", mode="w",
                                             encoding="utf-8", delete=False) as tmp:
                tmp.write(html_content)
                tmp_path = tmp.name

            try:
                result = subprocess.run(
                    ["wkhtmltopdf", "--quiet", "--no-stop-slow-scripts",
                     "--disable-javascript", "--encoding", "utf-8",
                     tmp_path, str(target)],
                    capture_output=True, timeout=120
                )

                # wkhtmltopdf often exits non-zero but still writes a usable
                # PDF, so only a missing output file counts as failure.
                if not target.exists():
                    if result.returncode != 0:
                        stderr = result.stderr.decode("utf-8", errors="replace")
                        log.warning(f"wkhtmltopdf failed for {filename}: {stderr[:200]}")
                    return "failed"

                file_size = target.stat().st_size
                if file_size < 100:
                    # Probably an empty/broken PDF
                    target.unlink()
                    log.warning(f"Produced empty PDF for {filename}, skipping")
                    return "failed"

                log.info(f"DOWNLOADED: {filename} ({file_size:,} bytes)")
                file_index.add_downloaded(filename)
                free_gb = check_disk_space()
                log.debug(f"Space after download: {free_gb:.1f} GB free")
                return "downloaded"

            finally:
                # Always remove the temp HTML, even on failure/exception.
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass

        except subprocess.TimeoutExpired:
            log.warning(f"wkhtmltopdf timed out for {filename}")
            if attempt < MAX_RETRIES:
                time.sleep(RATE_LIMIT_RETRY)
            else:
                return "failed"
        except requests.exceptions.RequestException as e:
            log.warning(f"NZLII download attempt {attempt}/{MAX_RETRIES} failed for {filename}: {e}")
            if attempt < MAX_RETRIES:
                time.sleep(RATE_LIMIT_RETRY)
            else:
                log.error(f"FAILED after {MAX_RETRIES} attempts: {filename} from {html_url}")
                return "failed"

    return "failed"


def scrape_nzlii(state, dry_run=False):
    """Scrape all employment law decisions from NZLII.

    Walks every database in NZLII_DATABASES year by year, downloading each
    decision as a PDF via nzlii_download_as_pdf().  Progress is checkpointed
    into *state* (per-database last_year/last_num) so an interrupted run can
    resume, and the scraper pauses itself (saving state) when free disk
    space falls below MIN_SPACE_GB.

    Returns a stats dict: {"downloaded": n, "skipped": n, "failed": n}.

    Fix: the filename-based skip message previously logged the literal text
    "(unknown)" instead of the filename.
    """
    log.info("=" * 60)
    log.info("Starting NZLII scraper")
    log.info("=" * 60)

    nzlii_state = state.get("nzlii", {})
    total_stats = {"downloaded": 0, "skipped": 0, "failed": 0}

    for db_key, db_path in NZLII_DATABASES.items():
        db_state = nzlii_state.get(db_key, {"last_year": 0, "last_num": 0})
        last_year = db_state.get("last_year", 0)
        last_num = db_state.get("last_num", 0)

        log.info(f"NZLII {db_key}: Fetching available years...")
        years = nzlii_get_years(db_path)
        if not years:
            log.warning(f"NZLII {db_key}: No years found")
            continue

        log.info(f"NZLII {db_key}: Found years {years[0]}-{years[-1]} ({len(years)} years)")

        for year in years:
            # Resume support: skip years already fully processed.
            if year < last_year:
                continue

            # Check disk space before starting each year.
            free_gb = check_disk_space()
            if free_gb < MIN_SPACE_GB:
                log.warning(f"LOW DISK SPACE: {free_gb:.1f} GB free (threshold: {MIN_SPACE_GB} GB)")
                log.warning("Pausing scraper. Free up space and run with --resume to continue.")
                save_state(state)
                return total_stats

            log.info(f"NZLII {db_key}: Processing year {year}...")
            time.sleep(RATE_LIMIT_LISTING)

            entries = nzlii_get_year_entries(db_path, year)
            if not entries:
                log.debug(f"NZLII {db_key}/{year}: No entries found")
                continue

            log.info(f"NZLII {db_key}/{year}: Found {len(entries)} decisions")

            for entry in entries:
                # Resume support: skip entries already processed within the resume year.
                entry_num = int(entry.get("number", 0))
                if year == last_year and entry_num <= last_num:
                    continue

                citation = entry.get("citation", "")
                html_url = entry.get("html_url", "")

                if not html_url:
                    continue

                # Skip decisions already on disk (matched by citation).
                if check_existing_by_citation(citation):
                    log.debug(f"NZLII SKIP (exists): {citation}")
                    total_stats["skipped"] += 1
                    continue

                filename = build_nzlii_filename(entry)

                # Skip decisions already on disk (matched by exact filename).
                if file_already_exists(filename):
                    log.debug(f"NZLII SKIP (exists): {filename}")
                    total_stats["skipped"] += 1
                    continue

                # Download and convert to PDF.
                referer = f"{NZLII_BASE}{db_path}{year}/"
                result = nzlii_download_as_pdf(html_url, filename, referer=referer, dry_run=dry_run)

                if result == "downloaded":
                    total_stats["downloaded"] += 1
                    # Re-check disk space every 10 downloads; checkpoint the
                    # exact position before pausing so --resume restarts here.
                    if total_stats["downloaded"] % 10 == 0:
                        free_gb = check_disk_space()
                        if free_gb < MIN_SPACE_GB:
                            log.warning(f"LOW DISK SPACE: {free_gb:.1f} GB free (threshold: {MIN_SPACE_GB} GB)")
                            log.warning("Pausing scraper. Free up space and run with --resume to continue.")
                            db_state["last_year"] = year
                            db_state["last_num"] = entry_num
                            nzlii_state[db_key] = db_state
                            state["nzlii"] = nzlii_state
                            save_state(state)
                            return total_stats
                elif result == "skipped":
                    total_stats["skipped"] += 1
                elif result == "failed":
                    total_stats["failed"] += 1
                    state.setdefault("failed", []).append({
                        "url": html_url, "filename": filename, "citation": citation,
                        "source": f"nzlii-{db_key}"
                    })

                time.sleep(RATE_LIMIT_DOWNLOAD)

            # Checkpoint after each completed year.
            db_state["last_year"] = year
            db_state["last_num"] = 0  # Reset number tracking for next year
            nzlii_state[db_key] = db_state
            state["nzlii"] = nzlii_state
            save_state(state)

        log.info(f"NZLII {db_key} complete")

    log.info(f"NZLII complete: {total_stats['downloaded']} downloaded, "
             f"{total_stats['skipped']} skipped, {total_stats['failed']} failed")
    return total_stats


# === Main ===

def main():
    """Command-line entry point: parse arguments, prepare state, run scrapers."""
    parser = argparse.ArgumentParser(description="NZ Employment Law Case Scraper")
    parser.add_argument("--source", choices=["era", "empc", "nzlii", "all"], default="all",
                        help="Which source to scrape (default: all)")
    parser.add_argument("--resume", action="store_true",
                        help="Resume from last saved state")
    parser.add_argument("--dry-run", action="store_true",
                        help="Don't actually download, just log what would be downloaded")
    parser.add_argument("--min-space-gb", type=float, default=5,
                        help="Minimum free disk space in GB before pausing (default: 5)")
    args = parser.parse_args()

    # The pause threshold lives in a module-level constant; override from CLI.
    global MIN_SPACE_GB
    MIN_SPACE_GB = args.min_space_gb

    banner = "=" * 60
    log.info(banner)
    log.info("NZ Employment Law Case Scraper")
    log.info(f"Source: {args.source}")
    log.info(f"Resume: {args.resume}")
    log.info(f"Dry run: {args.dry_run}")
    log.info(f"Min space: {args.min_space_gb} GB")
    log.info(f"Target dir: {BASE_DIR}")
    log.info(banner)

    # Refuse to start below the minimum free-space threshold.
    free_gb = check_disk_space()
    log.info(f"Initial disk space: {free_gb:.1f} GB free")
    if free_gb < MIN_SPACE_GB:
        log.error(f"Insufficient disk space: {free_gb:.1f} GB (need {MIN_SPACE_GB} GB)")
        sys.exit(1)

    # In-memory file index for fast duplicate detection.
    # NOTE(review): reaches into file_index._exact_names (private attribute)
    # to count PDFs — consider exposing a public count on the index.
    file_index.build(BASE_DIR)
    existing = sum(1 for n in file_index._exact_names if n.endswith(".pdf"))
    log.info(f"Existing PDF files: {existing}")

    # Load saved progress, or start from a fresh state structure.
    if args.resume:
        state = load_state()
        nzlii_st = state.get("nzlii", {})
        log.info(f"Resuming from state: ERA offset={state.get('era_offset', 0)}, "
                 f"EMPC offset={state.get('empc_offset', 0)}, "
                 f"NZLII={nzlii_st}")
    else:
        state = {"era_offset": 0, "empc_offset": 0, "nzlii": {},
                 "downloaded": [], "failed": [], "skipped": []}

    # Run each scraper whose key matches --source (or all of them).
    scrapers = {"era": scrape_era, "empc": scrape_empc, "nzlii": scrape_nzlii}
    all_stats = {}
    for key, scraper in scrapers.items():
        if args.source in (key, "all"):
            all_stats[key] = scraper(state, args.dry_run)

    # Final summary across all scrapers that ran.
    log.info(banner)
    log.info("SCRAPING COMPLETE - SUMMARY")
    log.info(banner)
    totals = {metric: sum(s.get(metric, 0) for s in all_stats.values())
              for metric in ("downloaded", "skipped", "failed")}
    log.info(f"Total downloaded: {totals['downloaded']}")
    log.info(f"Total skipped: {totals['skipped']}")
    log.info(f"Total failed: {totals['failed']}")
    log.info(f"Final disk space: {check_disk_space():.1f} GB free")

    save_state(state)


if __name__ == "__main__":
    main()
