ArchiveBox_ArchiveBox/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py
Nick Sweeting ec4b27056e
wip
2026-01-21 03:19:56 -08:00

398 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Archive a URL using SingleFile.
Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
Output: Writes singlefile.html to $PWD
Environment variables:
SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True)
SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file)
SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY) [unused; shared Chrome session required]
SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT)
SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
SINGLEFILE_CHROME_ARGS: Chrome command-line arguments (x-fallback: CHROME_ARGS) [unused; shared Chrome session required]
SINGLEFILE_ARGS: Default SingleFile arguments (JSON array)
SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
import os
import subprocess
import sys
import threading
import time
from urllib.request import urlopen
from pathlib import Path
import shutil
import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'singlefile'       # plugin identifier (matches the plugin directory name)
BIN_NAME = 'single-file'         # default CLI binary name for the SingleFile tool
BIN_PROVIDERS = 'npm,env'        # install sources for the binary — presumably consumed by a binary-provider resolver; TODO confirm
OUTPUT_DIR = '.'                 # hook is already run inside the snapshot's output directory
OUTPUT_FILE = 'singlefile.html'  # filename of the archived page written to OUTPUT_DIR
# Node.js helper script that drives the SingleFile browser extension; expected to live next to this file
EXTENSION_SAVE_SCRIPT = Path(__file__).parent / 'singlefile_extension_save.js'
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
STATICFILE_DIR = '../staticfile'


def has_staticfile_output() -> bool:
    """Check if staticfile extractor already downloaded this URL.

    Scans ../staticfile/stdout.log (JSONL) for an ArchiveResult record
    with status 'succeeded'. Returns False when the directory or log is
    absent, or when no such record is found.
    """
    staticfile_dir = Path(STATICFILE_DIR)
    log_path = staticfile_dir / 'stdout.log'
    if not (staticfile_dir.exists() and log_path.exists()):
        return False
    for raw in log_path.read_text(errors='ignore').splitlines():
        candidate = raw.strip()
        # Only lines that look like JSON objects are candidate records;
        # everything else is plain log noise.
        if not candidate.startswith('{'):
            continue
        try:
            record = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded':
            return True
    return False
# Chrome session directory (relative to extractor output dir)
# Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for.
# The centralized Chrome binary search is in chrome_utils.js findChromium().
CHROME_SESSION_DIR = '../chrome'
def get_cdp_url(wait_seconds: float = 0.0) -> str | None:
"""Get CDP URL from chrome plugin if available."""
cdp_file = Path(CHROME_SESSION_DIR) / 'cdp_url.txt'
deadline = time.time() + max(wait_seconds, 0.0)
while True:
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
return cdp_url or None
if time.time() >= deadline:
return None
time.sleep(0.2)
def get_port_from_cdp_url(cdp_url: str) -> str | None:
"""Extract port from CDP WebSocket URL (ws://127.0.0.1:PORT/...)."""
import re
match = re.search(r':(\d+)/', cdp_url)
if match:
return match.group(1)
return None
def is_cdp_server_available(cdp_remote_url: str) -> bool:
    """Probe Chrome's DevTools HTTP endpoint; True only for an HTTP 200.

    Any failure (bad URL, refused connection, timeout) is treated as
    "not available" rather than raised.
    """
    probe = f'{cdp_remote_url}/json/version'
    try:
        with urlopen(probe, timeout=1) as response:
            return response.status == 200
    except Exception:
        return False
def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using SingleFile.
    Requires a Chrome session (from chrome plugin) and connects to it via CDP.
    Returns: (success, output_path, error_message)
    """
    print(f'[singlefile] CLI mode start url={url}', file=sys.stderr)
    # Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader)
    timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
    user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
    # Tri-state: only consult SINGLEFILE_CHECK_SSL_VALIDITY when it is actually set,
    # otherwise fall through to the global CHECK_SSL_VALIDITY (both default True).
    check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', True) if get_env('SINGLEFILE_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
    cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    singlefile_args = get_env_array('SINGLEFILE_ARGS', [])
    singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', [])
    # Chrome args/binary are intentionally ignored because we require a shared Chrome session
    cmd = [binary, *singlefile_args]
    # Try to use existing Chrome session via CDP (prefer HTTP base URL).
    # Wait time scales with the configured timeout, clamped to 1..10 seconds.
    cdp_wait = min(10, max(1, timeout // 10))
    cdp_url = get_cdp_url(wait_seconds=cdp_wait)
    cdp_remote_url = None
    if cdp_url:
        if cdp_url.startswith(('http://', 'https://')):
            # Already an HTTP base URL — use as-is.
            cdp_remote_url = cdp_url
        else:
            # ws:// URL: derive the HTTP endpoint from the port if possible,
            # otherwise pass the raw URL through and let SingleFile cope.
            port = get_port_from_cdp_url(cdp_url)
            if port:
                cdp_remote_url = f'http://127.0.0.1:{port}'
            else:
                cdp_remote_url = cdp_url
    # Verify the endpoint actually answers before committing to it.
    if cdp_remote_url and not is_cdp_server_available(cdp_remote_url):
        cdp_remote_url = None
    if cdp_remote_url:
        print(f'[singlefile] Using existing Chrome session: {cdp_remote_url}', file=sys.stderr)
        cmd.extend(['--browser-server', cdp_remote_url])
    else:
        # Hard requirement: this extractor never launches its own browser.
        return False, None, 'No Chrome session found (chrome plugin must run first)'
    # SSL handling
    if not check_ssl:
        cmd.append('--browser-ignore-insecure-certs')
    if user_agent:
        cmd.extend(['--user-agent', user_agent])
    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--browser-cookies-file', cookies_file])
    # Add extra args from config
    if singlefile_args_extra:
        cmd.extend(singlefile_args_extra)
    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
    output_path = output_dir / OUTPUT_FILE
    cmd.extend([url, str(output_path)])
    print(f'[singlefile] CLI command: {" ".join(cmd[:6])} ...', file=sys.stderr)
    try:
        output_lines: list[str] = []
        # Merge stderr into stdout so a single reader thread sees everything;
        # bufsize=1 requests line buffering in text mode.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )
        def _read_output() -> None:
            # Drain the child's combined output, teeing each line to our stderr
            # while accumulating it for post-mortem error classification.
            if not process.stdout:
                return
            for line in process.stdout:
                output_lines.append(line)
                sys.stderr.write(line)
        # Daemon thread so a stuck pipe can never block interpreter exit.
        reader = threading.Thread(target=_read_output, daemon=True)
        reader.start()
        try:
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            reader.join(timeout=1)
            return False, None, f'Timed out after {timeout} seconds'
        reader.join(timeout=1)
        combined_output = ''.join(output_lines)
        # Success is judged by the artifact, not the exit code: a non-empty
        # singlefile.html means the archive was produced.
        if output_path.exists() and output_path.stat().st_size > 0:
            return True, str(output_path), ''
        else:
            stderr = combined_output
            # Map well-known Chromium network errors to concise messages.
            if 'ERR_NAME_NOT_RESOLVED' in stderr:
                return False, None, 'DNS resolution failed'
            if 'ERR_CONNECTION_REFUSED' in stderr:
                return False, None, 'Connection refused'
            # Cap the detail so the ArchiveResult stays a reasonable size.
            detail = (stderr or '').strip()
            if len(detail) > 2000:
                detail = detail[:2000]
            # Redact the JSON value after --browser-args in the reported command
            # (defensive: this code path never adds that flag itself, but
            # SINGLEFILE_ARGS from config might).
            cmd_preview = list(cmd)
            if '--browser-args' in cmd_preview:
                idx = cmd_preview.index('--browser-args')
                if idx + 1 < len(cmd_preview):
                    cmd_preview[idx + 1] = '<json>'
            cmd_str = ' '.join(cmd_preview)
            return False, None, f'SingleFile failed (cmd={cmd_str}): {detail}'
    except subprocess.TimeoutExpired:
        # Defensive: the inner handler above should already catch this.
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        # Catch-all so the hook always reports a structured failure.
        return False, None, f'{type(e).__name__}: {e}'
def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]:
    """Save using the SingleFile Chrome extension via existing Chrome session.

    Runs the Node helper script (EXTENSION_SAVE_SCRIPT), which is expected
    to write singlefile.html into the current directory or print the output
    path on stdout. Returns (success, output_path, error_message).
    """
    print(f'[singlefile] Extension mode start url={url}', file=sys.stderr)
    # Only attempt if chrome session exists (wait up to 1..5s, scaled by timeout)
    cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
    if not cdp_url:
        print('[singlefile] No Chrome session found (chrome plugin must run first)', file=sys.stderr)
        return False, None, 'No Chrome session found (chrome plugin must run first)'
    if not EXTENSION_SAVE_SCRIPT.exists():
        print(f'[singlefile] Missing helper script: {EXTENSION_SAVE_SCRIPT}', file=sys.stderr)
        return False, None, 'SingleFile extension helper script missing'
    node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node')
    # Read only for diagnostics below; the helper script presumably reads
    # these from its own environment — TODO confirm.
    downloads_dir = get_env('CHROME_DOWNLOADS_DIR', '')
    extensions_dir = get_env('CHROME_EXTENSIONS_DIR', '')
    cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}']
    # Verbose diagnostics: everything needed to reproduce the helper invocation.
    print(f'[singlefile] cdp_url={cdp_url}', file=sys.stderr)
    print(f'[singlefile] node={node_binary}', file=sys.stderr)
    node_resolved = shutil.which(node_binary) if node_binary else None
    print(f'[singlefile] node_resolved={node_resolved}', file=sys.stderr)
    print(f'[singlefile] PATH={os.environ.get("PATH","")}', file=sys.stderr)
    if downloads_dir:
        print(f'[singlefile] CHROME_DOWNLOADS_DIR={downloads_dir}', file=sys.stderr)
    if extensions_dir:
        print(f'[singlefile] CHROME_EXTENSIONS_DIR={extensions_dir}', file=sys.stderr)
    print(f'[singlefile] helper_cmd={" ".join(cmd)}', file=sys.stderr)
    try:
        output_lines: list[str] = []
        error_lines: list[str] = []
        # Separate pipes here (unlike save_singlefile): stdout may carry the
        # output path, so it must not be mixed with stderr logging.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
        )
        def _read_stream(stream, sink, label: str) -> None:
            # Tee one pipe into *sink* while mirroring it to our stderr.
            # NOTE(review): *label* is currently unused — kept only for
            # call-site readability.
            if not stream:
                return
            for line in stream:
                sink.append(line)
                sys.stderr.write(line)
                sys.stderr.flush()
        # One daemon reader thread per pipe so neither can deadlock on a full buffer.
        stdout_thread = threading.Thread(target=_read_stream, args=(process.stdout, output_lines, 'stdout'), daemon=True)
        stderr_thread = threading.Thread(target=_read_stream, args=(process.stderr, error_lines, 'stderr'), daemon=True)
        stdout_thread.start()
        stderr_thread.start()
        try:
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            stdout_thread.join(timeout=1)
            stderr_thread.join(timeout=1)
            print(f'[singlefile] Extension helper timed out after {timeout}s', file=sys.stderr)
            return False, None, f'Timed out after {timeout} seconds'
        stdout_thread.join(timeout=1)
        stderr_thread.join(timeout=1)
        # Re-encode to bytes, mirroring subprocess.run(capture_output=True)-style
        # fields; decoded again where used below.
        result_stdout = ''.join(output_lines).encode('utf-8', errors='replace')
        result_stderr = ''.join(error_lines).encode('utf-8', errors='replace')
        result_returncode = process.returncode
    except Exception as e:
        print(f'[singlefile] Extension helper error: {type(e).__name__}: {e}', file=sys.stderr)
        return False, None, f'{type(e).__name__}: {e}'
    print(f'[singlefile] helper_returncode={result_returncode}', file=sys.stderr)
    print(f'[singlefile] helper_stdout_len={len(result_stdout or b"")}', file=sys.stderr)
    print(f'[singlefile] helper_stderr_len={len(result_stderr or b"")}', file=sys.stderr)
    if result_returncode == 0:
        # Prefer explicit stdout path, fallback to local output file
        out_text = result_stdout.decode('utf-8', errors='replace').strip()
        if out_text and Path(out_text).exists():
            print(f'[singlefile] Extension output: {out_text}', file=sys.stderr)
            return True, out_text, ''
        output_path = Path(OUTPUT_DIR) / OUTPUT_FILE
        if output_path.exists() and output_path.stat().st_size > 0:
            print(f'[singlefile] Extension output: {output_path}', file=sys.stderr)
            return True, str(output_path), ''
        # Exit code 0 but nothing on disk: treat as failure.
        return False, None, 'SingleFile extension completed but no output file found'
    # Non-zero exit: surface whichever stream carried an explanation.
    stderr = result_stderr.decode('utf-8', errors='replace').strip()
    stdout = result_stdout.decode('utf-8', errors='replace').strip()
    detail = stderr or stdout
    return False, None, detail or 'SingleFile extension failed'
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using SingleFile."""
    print(f'[singlefile] Hook starting pid={os.getpid()} url={url}', file=sys.stderr)
    out_path = None
    failure = ''
    run_status = 'failed'
    try:
        # Guard: feature disabled -> no ArchiveResult is emitted at all.
        if not get_env_bool('SINGLEFILE_ENABLED', True):
            print('Skipping SingleFile (SINGLEFILE_ENABLED=False)', file=sys.stderr)
            sys.exit(0)
        # Guard: staticfile extractor already captured this URL -> permanent skip.
        if has_staticfile_output():
            print('Skipping SingleFile - staticfile extractor already downloaded this', file=sys.stderr)
            print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
            sys.exit(0)
        # Prefer the SingleFile extension via the shared Chrome session.
        run_timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
        ok, out_path, failure = save_singlefile_with_extension(url, run_timeout)
        if ok:
            run_status = 'succeeded'
    except Exception as exc:
        # sys.exit() raises SystemExit, which is NOT caught here by design.
        failure = f'{type(exc).__name__}: {exc}'
        run_status = 'failed'
    if failure:
        print(f'ERROR: {failure}', file=sys.stderr)
    # Output clean JSONL (no RESULT_JSON= prefix)
    print(json.dumps({
        'type': 'ArchiveResult',
        'status': run_status,
        'output_str': out_path or failure or '',
    }))
    sys.exit(0 if run_status == 'succeeded' else 1)


if __name__ == '__main__':
    main()