From bef67760dbf7b0d5f6cbe7bda76736a897177e44 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Jan 2026 03:05:49 -0800 Subject: [PATCH] working singlefile --- archivebox/machine/models.py | 14 +- archivebox/misc/progress_layout.py | 94 +++++++- ...py => on_Snapshot__13_archivedotorg.bg.py} | 2 +- archivebox/plugins/chrome/chrome_utils.js | 41 +++- .../chrome/on_Snapshot__10_chrome_tab.bg.js | 11 +- ...vicon.py => on_Snapshot__11_favicon.bg.py} | 2 +- .../forumdl/on_Snapshot__04_forumdl.bg.py | 30 ++- .../gallerydl/on_Snapshot__03_gallerydl.bg.py | 2 +- .../mercury/on_Snapshot__57_mercury.py | 6 + .../on_Snapshot__56_readability.py | 4 + .../on_Crawl__82_singlefile_install.js | 14 +- .../singlefile/on_Snapshot__50_singlefile.py | 82 ++++++- .../singlefile/singlefile_extension_save.js | 207 ++++++++++++++++++ .../plugins/ublock/tests/test_ublock.py | 3 +- .../plugins/wget/on_Snapshot__06_wget.bg.py | 5 +- .../plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py | 2 +- archivebox/workers/orchestrator.py | 33 ++- 17 files changed, 498 insertions(+), 54 deletions(-) rename archivebox/plugins/archivedotorg/{on_Snapshot__13_archivedotorg.py => on_Snapshot__13_archivedotorg.bg.py} (98%) rename archivebox/plugins/favicon/{on_Snapshot__11_favicon.py => on_Snapshot__11_favicon.bg.py} (98%) create mode 100644 archivebox/plugins/singlefile/singlefile_extension_save.js diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 210452f9..b63845ac 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -1020,14 +1020,14 @@ class Process(models.Model): # Debug logging import sys - print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr) + # print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr) # Get parent process start time from OS try: os_parent = psutil.Process(ppid) os_parent_start = os_parent.create_time() except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): - print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr) + # print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr) return None # Parent process doesn't exist # Find matching Process record @@ -1038,18 +1038,18 @@ class Process(models.Model): started_at__gte=timezone.now() - PID_REUSE_WINDOW, ).order_by('-started_at') - print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr) + # print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr) for candidate in candidates: if candidate.started_at: db_start_time = candidate.started_at.timestamp() time_diff = abs(db_start_time - os_parent_start) - print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr) + # print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr) if time_diff < START_TIME_TOLERANCE: - print(f"DEBUG _find_parent_process: MATCH! Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr) + # print(f"DEBUG _find_parent_process: MATCH! 
Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr) return candidate - print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr) + # print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr) return None # No matching ArchiveBox parent process @classmethod @@ -1519,7 +1519,7 @@ class Process(models.Model): stdout_path = self.stdout_file stderr_path = self.stderr_file - with open(stdout_path, 'w') as out, open(stderr_path, 'w') as err: + with open(stdout_path, 'a') as out, open(stderr_path, 'a') as err: proc = subprocess.Popen( self.cmd, cwd=working_dir, diff --git a/archivebox/misc/progress_layout.py b/archivebox/misc/progress_layout.py index e406d7d4..eb6fdb3a 100644 --- a/archivebox/misc/progress_layout.py +++ b/archivebox/misc/progress_layout.py @@ -10,6 +10,7 @@ Shows a comprehensive dashboard with: __package__ = 'archivebox.misc' from datetime import datetime, timezone +import os import re from typing import List, Optional, Any from collections import deque @@ -23,6 +24,7 @@ from rich.panel import Panel from rich.text import Text from rich.table import Table from rich.tree import Tree +from rich.cells import cell_len from archivebox.config import VERSION @@ -533,7 +535,23 @@ class CrawlQueueTreePanel: is_pending = hook.get('is_pending', False) icon, color = self._hook_style(status, is_bg=is_bg, is_running=is_running, is_pending=is_pending) stats = self._hook_stats(size=size, elapsed=elapsed, timeout=timeout, status=status) - snap_node.add(Text(f"{icon} {path}{stats}", style=color)) + line = Text(f"{icon} {path}{stats}", style=color) + stderr_tail = hook.get('stderr', '') + if stderr_tail: + left_str = f"{icon} {path}{stats}" + avail = self._available_width(left_str, indent=16) + trunc = getattr(self, "_truncate_tail", self._truncate_to_width) + stderr_tail = trunc(stderr_tail, avail) + if not stderr_tail: + snap_node.add(line) + continue + row = Table.grid(expand=True) + row.add_column(justify="left", ratio=1) + row.add_column(justify="right") + row.add_row(line, Text(stderr_tail, style="grey70")) + snap_node.add(row) + else: + snap_node.add(line) trees.append(crawl_tree) content = Group(*trees) @@ -561,7 +579,7 @@ class CrawlQueueTreePanel: if status == 'succeeded': return '✅', 'green' if status == 'failed': - return '⚠️', 'yellow' + return '✖', 'red' if status == 'skipped': return '⏭', 'grey53' if is_pending: @@ -595,6 +613,37 @@ class CrawlQueueTreePanel: return f" ({size_part} | {time_part})" if time_part else f" ({size_part})" return '' + @staticmethod + def _terminal_width() -> int: + try: + return os.get_terminal_size().columns + except OSError: + return 120 + + @staticmethod + def _truncate_to_width(text: str, max_width: int) -> str: + if not text or max_width <= 0: + return '' + t = Text(text) + t.truncate(max_width, overflow="ellipsis") + return t.plain + + @staticmethod + def _truncate_tail(text: str, max_width: int) -> str: + if not text or max_width <= 0: + return '' + if cell_len(text) <= max_width: + return text + if max_width <= 1: + return '…' + return f"…{text[-(max_width - 1):]}" + + def _available_width(self, left_text: str, indent: int = 0) -> int: + width = self._terminal_width() + base = max(0, width - cell_len(left_text) - indent - 6) + cap = max(0, (width * 2) // 5) + return max(0, min(base, cap)) + class ArchiveBoxProgressLayout: """ @@ -631,7 +680,7 @@ class ArchiveBoxProgressLayout: # Top-level split: crawl_queue, crawl_tree, processes layout.split( 
Layout(name="crawl_queue", size=3), - Layout(name="crawl_tree", size=14), + Layout(name="crawl_tree", size=20), Layout(name="processes", ratio=1), ) @@ -671,6 +720,8 @@ class ArchiveBoxProgressLayout: cmd = getattr(process, 'cmd', []) hook_path = Path(cmd[1]) if len(cmd) > 1 else None hook_name = hook_path.name if hook_path else '' + if '.bg.' in hook_name: + continue if '.bg.' not in hook_name: fg_running = True break @@ -684,6 +735,8 @@ class ArchiveBoxProgressLayout: cmd = getattr(process, 'cmd', []) hook_path = Path(cmd[1]) if len(cmd) > 1 else None hook_name = hook_path.name if hook_path else '' + if '.bg.' in hook_name: + continue if '.bg.' not in hook_name: fg_pending = True break @@ -701,6 +754,10 @@ class ArchiveBoxProgressLayout: is_bg = '.bg.' in hook_name except Exception: is_bg = False + if is_hook and is_bg: + continue + if not self._has_log_lines(process): + continue is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None)) max_lines = 2 if is_pending else (4 if is_bg else 7) panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg, bg_terminating=bg_terminating)) @@ -718,6 +775,17 @@ class ArchiveBoxProgressLayout: def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None: """Update the crawl queue tree panel.""" self.crawl_queue_tree.update_crawls(crawls) + # Auto-size crawl tree panel to content + line_count = 0 + for crawl in crawls: + line_count += 1 + for snap in crawl.get('snapshots', []) or []: + line_count += 1 + if snap.get('output_path'): + line_count += 1 + for _ in snap.get('hooks', []) or []: + line_count += 1 + self.layout["crawl_tree"].size = max(4, line_count + 2) def log_event(self, message: str, style: str = "white") -> None: """Add an event to the orchestrator log.""" @@ -767,8 +835,28 @@ class ArchiveBoxProgressLayout: timeout=hook.get('timeout', ''), status=status, ) + stderr_tail = hook.get('stderr', '') hook_line = f" {icon} {path}{stats}".strip() + if stderr_tail: + avail = self.crawl_queue_tree._available_width(hook_line, indent=16) + trunc = getattr(self.crawl_queue_tree, "_truncate_tail", self.crawl_queue_tree._truncate_to_width) + stderr_tail = trunc(stderr_tail, avail) + if stderr_tail: + hook_line = f"{hook_line} {stderr_tail}" if hook_line: lines.append(("crawl_tree", hook_line)) return lines + + @staticmethod + def _has_log_lines(process: Any) -> bool: + try: + stdout_lines = list(process.tail_stdout(lines=1, follow=False)) + if any(line.strip() for line in stdout_lines): + return True + stderr_lines = list(process.tail_stderr(lines=1, follow=False)) + if any(line.strip() for line in stderr_lines): + return True + except Exception: + return False + return False diff --git a/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py b/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.bg.py similarity index 98% rename from archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py rename to archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.bg.py index 36522417..11642b24 100644 --- a/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py +++ b/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.bg.py @@ -2,7 +2,7 @@ """ Submit a URL to archive.org for archiving. 
-Usage: on_Snapshot__archivedotorg.py --url= --snapshot-id= +Usage: on_Snapshot__archivedotorg.bg.py --url= --snapshot-id= Output: Writes archive.org.txt to $PWD with the archived URL Environment variables: diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index 4de49341..df43115f 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -803,9 +803,16 @@ try { * @returns {string} - 32-character extension ID */ function getExtensionId(unpacked_path) { + let resolved_path = unpacked_path; + try { + resolved_path = fs.realpathSync(unpacked_path); + } catch (err) { + // Use the provided path if realpath fails + resolved_path = unpacked_path; + } // Chrome uses a SHA256 hash of the unpacked extension directory path const hash = crypto.createHash('sha256'); - hash.update(Buffer.from(unpacked_path, 'utf-8')); + hash.update(Buffer.from(resolved_path, 'utf-8')); // Convert first 32 hex chars to characters in the range 'a'-'p' const detected_extension_id = Array.from(hash.digest('hex')) @@ -978,6 +985,8 @@ async function isTargetExtension(target) { let extension_id = null; let manifest_version = null; + let manifest = null; + let manifest_name = null; const target_is_extension = is_chrome_extension || target_is_bg; if (target_is_extension) { @@ -985,8 +994,9 @@ async function isTargetExtension(target) { extension_id = target_url?.split('://')[1]?.split('/')[0] || null; if (target_ctx) { - const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest()); + manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest()); manifest_version = manifest?.manifest_version || null; + manifest_name = manifest?.name || null; } } catch (err) { // Failed to get extension metadata @@ -1001,6 +1011,8 @@ async function isTargetExtension(target) { target_url, extension_id, manifest_version, + manifest, + manifest_name, }; } @@ -1053,14 +1065,23 @@ async function loadExtensionFromTarget(extensions, target) { // Trigger extension toolbar button click dispatchAction: async (tab) => { - return await target_ctx.evaluate((tabId) => { - return new Promise((resolve) => { - chrome.action.onClicked.addListener((tab) => { - resolve({ success: true, tab }); - }); - chrome.action.openPopup(); - }); - }, tab?.id || null); + return await target_ctx.evaluate(async (tab) => { + tab = tab || (await new Promise((resolve) => + chrome.tabs.query({ currentWindow: true, active: true }, ([tab]) => resolve(tab)) + )); + + // Manifest V3: chrome.action + if (chrome.action?.onClicked?.dispatch) { + return await chrome.action.onClicked.dispatch(tab); + } + + // Manifest V2: chrome.browserAction + if (chrome.browserAction?.onClicked?.dispatch) { + return await chrome.browserAction.onClicked.dispatch(tab); + } + + throw new Error('Extension action dispatch not available'); + }, tab || null); }, // Send message to extension diff --git a/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js index fca4acdc..ca8e8232 100755 --- a/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js +++ b/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js @@ -118,9 +118,7 @@ process.on('SIGTERM', () => cleanup('SIGTERM')); process.on('SIGINT', () => cleanup('SIGINT')); // Try to find the crawl's Chrome session -function findCrawlChromeSession(crawlId) { - if (!crawlId) return null; - +function findCrawlChromeSession() { // Use CRAWL_OUTPUT_DIR env var set by 
get_config() in configset.py const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', ''); if (!crawlOutputDir) return null; @@ -301,7 +299,7 @@ async function main() { const args = parseArgs(); const url = args.url; const snapshotId = args.snapshot_id; - const crawlId = args.crawl_id; + const crawlId = args.crawl_id || getEnv('CRAWL_ID', ''); if (!url || !snapshotId) { console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url= --snapshot-id= [--crawl-id=]'); @@ -332,15 +330,14 @@ async function main() { } // Try to use existing crawl Chrome session - const crawlSession = findCrawlChromeSession(crawlId); + const crawlSession = findCrawlChromeSession(); let result; if (crawlSession) { console.log(`[*] Found existing Chrome session from crawl ${crawlId}`); result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid); } else { - console.log(`[*] No crawl Chrome session found, launching new Chrome`); - result = await launchNewChrome(url, binary); + result = { success: false, error: 'No crawl Chrome session found (CRAWL_OUTPUT_DIR missing or chrome not running)' }; } if (result.success) { diff --git a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py b/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py similarity index 98% rename from archivebox/plugins/favicon/on_Snapshot__11_favicon.py rename to archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py index 4b40d726..cb62dfe3 100644 --- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py +++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py @@ -2,7 +2,7 @@ """ Extract favicon from a URL. -Usage: on_Snapshot__favicon.py --url= --snapshot-id= +Usage: on_Snapshot__favicon.bg.py --url= --snapshot-id= Output: Writes favicon.ico to $PWD Environment variables: diff --git a/archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py b/archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py index 9d2c2461..d19e7e16 100755 --- a/archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py +++ b/archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py @@ -17,6 +17,7 @@ Environment variables: import json import os +import shutil import subprocess import sys import threading @@ -87,6 +88,27 @@ def get_env_array(name: str, default: list[str] | None = None) -> list[str]: return default if default is not None else [] +def get_binary_shebang(binary_path: str) -> str | None: + """Return interpreter from shebang line if present (e.g., /path/to/python).""" + try: + with open(binary_path, 'r', encoding='utf-8') as f: + first_line = f.readline().strip() + if first_line.startswith('#!'): + return first_line[2:].strip().split(' ')[0] + except Exception: + pass + return None + + +def resolve_binary_path(binary: str) -> str | None: + """Resolve binary to an absolute path if possible.""" + if not binary: + return None + if Path(binary).is_file(): + return binary + return shutil.which(binary) + + def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: """ @@ -118,10 +140,12 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: # Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py' + resolved_binary = resolve_binary_path(binary) or binary if wrapper_path.exists(): - cmd = [sys.executable, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)] + forumdl_python = get_binary_shebang(resolved_binary) or sys.executable + cmd = [forumdl_python, str(wrapper_path), *forumdl_args, '-f', 
output_format, '-o', str(output_file)] else: - cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)] + cmd = [resolved_binary, *forumdl_args, '-f', output_format, '-o', str(output_file)] if not check_ssl: cmd.append('--no-check-certificate') @@ -187,7 +211,7 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: if 'unable to extract' in stderr_lower: return False, None, 'Unable to extract forum info' - return False, None, f'forum-dl error: {stderr[:200]}' + return False, None, f'forum-dl error: {stderr}' except subprocess.TimeoutExpired: return False, None, f'Timed out after {timeout} seconds' diff --git a/archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py b/archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py index d4c2a08d..fc5d951c 100755 --- a/archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py +++ b/archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py @@ -196,7 +196,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: if 'unable to extract' in stderr_lower: return False, None, 'Unable to extract gallery info' - return False, None, f'gallery-dl error: {stderr[:200]}' + return False, None, f'gallery-dl error: {stderr}' except subprocess.TimeoutExpired: return False, None, f'Timed out after {timeout} seconds' diff --git a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py index a57c8933..b131c14c 100644 --- a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py +++ b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py @@ -82,6 +82,9 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: # Get text version cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text'] result_text = subprocess.run(cmd_text, stdout=subprocess.PIPE, timeout=timeout, text=True) + if result_text.stdout: + sys.stderr.write(result_text.stdout) + sys.stderr.flush() if result_text.returncode != 0: return False, None, f'postlight-parser failed (exit={result_text.returncode})' @@ -101,6 +104,9 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: # Get HTML version cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html'] result_html = subprocess.run(cmd_html, stdout=subprocess.PIPE, timeout=timeout, text=True) + if result_html.stdout: + sys.stderr.write(result_html.stdout) + sys.stderr.flush() try: html_json = json.loads(result_html.stdout) diff --git a/archivebox/plugins/readability/on_Snapshot__56_readability.py b/archivebox/plugins/readability/on_Snapshot__56_readability.py index 06c8ee8b..bf7a758f 100644 --- a/archivebox/plugins/readability/on_Snapshot__56_readability.py +++ b/archivebox/plugins/readability/on_Snapshot__56_readability.py @@ -109,6 +109,10 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]: cmd = [binary, *readability_args, *readability_args_extra, html_source] result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True) + if result.stdout: + sys.stderr.write(result.stdout) + sys.stderr.flush() + if result.returncode != 0: return False, None, f'readability-extractor failed (exit={result.returncode})' diff --git a/archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js b/archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js index 9e89f9be..8abefe4f 100755 --- a/archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js +++ 
b/archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js @@ -116,7 +116,19 @@ async function saveSinglefileWithExtension(page, extension, options = {}) { // Trigger the extension's action (toolbar button click) console.error('[singlefile] Dispatching extension action...'); - await extension.dispatchAction(); + try { + const actionTimeoutMs = options.actionTimeoutMs || 5000; + const actionPromise = extension.dispatchAction(); + const actionResult = await Promise.race([ + actionPromise, + wait(actionTimeoutMs).then(() => 'timeout'), + ]); + if (actionResult === 'timeout') { + console.error(`[singlefile] Extension action did not resolve within ${actionTimeoutMs}ms, continuing...`); + } + } catch (err) { + console.error(`[singlefile] Extension action error: ${err.message || err}`); + } // Wait for file to appear in downloads directory const check_delay = 3000; // 3 seconds diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py index 9ac58f51..3590c793 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py @@ -27,6 +27,7 @@ import threading import time from urllib.request import urlopen from pathlib import Path +import shutil import rich_click as click @@ -142,6 +143,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ + print(f'[singlefile] CLI mode start url={url}', file=sys.stderr) # Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader) timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120) user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '') @@ -172,8 +174,10 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: cdp_remote_url = None if cdp_remote_url: + print(f'[singlefile] Using existing Chrome session: {cdp_remote_url}', file=sys.stderr) cmd.extend(['--browser-server', cdp_remote_url]) elif chrome: + print(f'[singlefile] Launching Chrome binary: {chrome}', file=sys.stderr) cmd.extend(['--browser-executable-path', chrome]) # Pass Chrome arguments (only when launching a new browser) @@ -200,6 +204,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: output_path = output_dir / OUTPUT_FILE cmd.extend([url, str(output_path)]) + print(f'[singlefile] CLI command: {" ".join(cmd[:6])} ...', file=sys.stderr) try: output_lines: list[str] = [] @@ -258,36 +263,93 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]: """Save using the SingleFile Chrome extension via existing Chrome session.""" + print(f'[singlefile] Extension mode start url={url}', file=sys.stderr) # Only attempt if chrome session exists cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10))) if not cdp_url: + print('[singlefile] No chrome session (cdp_url.txt missing)', file=sys.stderr) return False, None, 'No Chrome session available' if not EXTENSION_SAVE_SCRIPT.exists(): + print(f'[singlefile] Missing helper script: {EXTENSION_SAVE_SCRIPT}', file=sys.stderr) return False, None, 'SingleFile extension helper script missing' node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node') + downloads_dir = get_env('CHROME_DOWNLOADS_DIR', '') + extensions_dir = get_env('CHROME_EXTENSIONS_DIR', '') cmd = [node_binary, 
str(EXTENSION_SAVE_SCRIPT), f'--url={url}'] + print(f'[singlefile] cdp_url={cdp_url}', file=sys.stderr) + print(f'[singlefile] node={node_binary}', file=sys.stderr) + node_resolved = shutil.which(node_binary) if node_binary else None + print(f'[singlefile] node_resolved={node_resolved}', file=sys.stderr) + print(f'[singlefile] PATH={os.environ.get("PATH","")}', file=sys.stderr) + if downloads_dir: + print(f'[singlefile] CHROME_DOWNLOADS_DIR={downloads_dir}', file=sys.stderr) + if extensions_dir: + print(f'[singlefile] CHROME_EXTENSIONS_DIR={extensions_dir}', file=sys.stderr) + print(f'[singlefile] helper_cmd={" ".join(cmd)}', file=sys.stderr) try: - result = subprocess.run(cmd, capture_output=True, timeout=timeout) - except subprocess.TimeoutExpired: - return False, None, f'Timed out after {timeout} seconds' + output_lines: list[str] = [] + error_lines: list[str] = [] + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, + ) + + def _read_stream(stream, sink, label: str) -> None: + if not stream: + return + for line in stream: + sink.append(line) + sys.stderr.write(line) + sys.stderr.flush() + + stdout_thread = threading.Thread(target=_read_stream, args=(process.stdout, output_lines, 'stdout'), daemon=True) + stderr_thread = threading.Thread(target=_read_stream, args=(process.stderr, error_lines, 'stderr'), daemon=True) + stdout_thread.start() + stderr_thread.start() + + try: + process.wait(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + stdout_thread.join(timeout=1) + stderr_thread.join(timeout=1) + print(f'[singlefile] Extension helper timed out after {timeout}s', file=sys.stderr) + return False, None, f'Timed out after {timeout} seconds' + + stdout_thread.join(timeout=1) + stderr_thread.join(timeout=1) + + result_stdout = ''.join(output_lines).encode('utf-8', errors='replace') + result_stderr = ''.join(error_lines).encode('utf-8', errors='replace') + result_returncode = process.returncode except Exception as e: + print(f'[singlefile] Extension helper error: {type(e).__name__}: {e}', file=sys.stderr) return False, None, f'{type(e).__name__}: {e}' - if result.returncode == 0: + print(f'[singlefile] helper_returncode={result_returncode}', file=sys.stderr) + print(f'[singlefile] helper_stdout_len={len(result_stdout or b"")}', file=sys.stderr) + print(f'[singlefile] helper_stderr_len={len(result_stderr or b"")}', file=sys.stderr) + + if result_returncode == 0: # Prefer explicit stdout path, fallback to local output file - out_text = result.stdout.decode('utf-8', errors='replace').strip() + out_text = result_stdout.decode('utf-8', errors='replace').strip() if out_text and Path(out_text).exists(): + print(f'[singlefile] Extension output: {out_text}', file=sys.stderr) return True, out_text, '' output_path = Path(OUTPUT_DIR) / OUTPUT_FILE if output_path.exists() and output_path.stat().st_size > 0: + print(f'[singlefile] Extension output: {output_path}', file=sys.stderr) return True, str(output_path), '' return False, None, 'SingleFile extension completed but no output file found' - stderr = result.stderr.decode('utf-8', errors='replace').strip() - stdout = result.stdout.decode('utf-8', errors='replace').strip() + stderr = result_stderr.decode('utf-8', errors='replace').strip() + stdout = result_stdout.decode('utf-8', errors='replace').strip() detail = stderr or stdout return False, None, detail or 'SingleFile extension failed' @@ -298,6 +360,7 @@ def save_singlefile_with_extension(url: str, timeout: int) -> 
tuple[bool, str | def main(url: str, snapshot_id: str): """Archive a URL using SingleFile.""" + print(f'[singlefile] Hook starting pid={os.getpid()} url={url}', file=sys.stderr) output = None status = 'failed' error = '' @@ -318,11 +381,6 @@ def main(url: str, snapshot_id: str): # Prefer SingleFile extension via existing Chrome session timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120) success, output, error = save_singlefile_with_extension(url, timeout) - - # Fallback to single-file-cli if extension path failed - if not success: - binary = get_env('SINGLEFILE_BINARY', 'single-file') - success, output, error = save_singlefile(url, binary) status = 'succeeded' if success else 'failed' except Exception as e: diff --git a/archivebox/plugins/singlefile/singlefile_extension_save.js b/archivebox/plugins/singlefile/singlefile_extension_save.js new file mode 100644 index 00000000..7bb8138e --- /dev/null +++ b/archivebox/plugins/singlefile/singlefile_extension_save.js @@ -0,0 +1,207 @@ +#!/usr/bin/env node +/** + * Save a page using the SingleFile Chrome extension via an existing Chrome session. + * + * Usage: singlefile_extension_save.js --url= + * Output: prints saved file path on success + */ + +const fs = require('fs'); +const path = require('path'); + +const CHROME_SESSION_DIR = '../chrome'; +const DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || + path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads'); + +process.env.CHROME_DOWNLOADS_DIR = DOWNLOADS_DIR; + +async function setDownloadDir(page, downloadDir) { + try { + await fs.promises.mkdir(downloadDir, { recursive: true }); + const client = await page.target().createCDPSession(); + try { + await client.send('Page.setDownloadBehavior', { + behavior: 'allow', + downloadPath: downloadDir, + }); + } catch (err) { + // Fallback for newer protocol versions + await client.send('Browser.setDownloadBehavior', { + behavior: 'allow', + downloadPath: downloadDir, + }); + } + } catch (err) { + console.error(`[⚠️] Failed to set download directory: ${err.message || err}`); + } +} + +function parseArgs() { + const args = {}; + process.argv.slice(2).forEach((arg) => { + if (arg.startsWith('--')) { + const [key, ...valueParts] = arg.slice(2).split('='); + args[key.replace(/-/g, '_')] = valueParts.join('=') || true; + } + }); + return args; +} + +async function main() { + const args = parseArgs(); + const url = args.url; + + if (!url) { + console.error('Usage: singlefile_extension_save.js --url='); + process.exit(1); + } + + console.error(`[singlefile] helper start url=${url}`); + console.error(`[singlefile] downloads_dir=${DOWNLOADS_DIR}`); + if (process.env.CHROME_EXTENSIONS_DIR) { + console.error(`[singlefile] extensions_dir=${process.env.CHROME_EXTENSIONS_DIR}`); + } + + try { + console.error('[singlefile] loading dependencies...'); + const puppeteer = require('puppeteer-core'); + const chromeUtils = require('../chrome/chrome_utils.js'); + const { + EXTENSION, + saveSinglefileWithExtension, + } = require('./on_Crawl__82_singlefile_install.js'); + console.error('[singlefile] dependencies loaded'); + + // Ensure extension is installed and metadata is cached + console.error('[singlefile] ensuring extension cache...'); + const extension = await chromeUtils.installExtensionWithCache( + EXTENSION, + { extensionsDir: process.env.CHROME_EXTENSIONS_DIR } + ); + if (!extension) { + console.error('[❌] SingleFile extension not installed'); + process.exit(2); + } + if 
(extension.unpacked_path) {
+      const runtimeId = chromeUtils.getExtensionId(extension.unpacked_path);
+      if (runtimeId) {
+        extension.id = runtimeId;
+      }
+    }
+    console.error(`[singlefile] extension ready id=${extension.id} version=${extension.version}`);
+
+    // Connect to existing Chrome session
+    console.error('[singlefile] connecting to chrome session...');
+    const { browser, page } = await chromeUtils.connectToPage({
+      chromeSessionDir: CHROME_SESSION_DIR,
+      timeoutMs: 60000,
+      puppeteer,
+    });
+    console.error('[singlefile] connected to chrome');
+
+    try {
+      // Ensure CDP target discovery is enabled so service_worker targets appear
+      try {
+        const client = await page.createCDPSession();
+        await client.send('Target.setDiscoverTargets', { discover: true });
+        await client.send('Target.setAutoAttach', { autoAttach: true, waitForDebuggerOnStart: false, flatten: true });
+      } catch (err) {
+        console.error(`[singlefile] failed to enable target discovery: ${err.message || err}`);
+      }
+
+      // Wait for extension target to be available, then attach dispatchAction
+      console.error('[singlefile] waiting for extension target...');
+      const deadline = Date.now() + 30000;
+      let matchTarget = null;
+      let matchInfo = null;
+      let lastLog = 0;
+      const wantedName = (extension.name || 'singlefile').toLowerCase();
+
+      while (Date.now() < deadline && !matchTarget) {
+        const targets = browser.targets();
+        for (const target of targets) {
+          const info = await chromeUtils.isTargetExtension(target);
+          if (!info?.target_is_extension || !info?.extension_id) {
+            continue;
+          }
+          const manifestName = (info.manifest_name || '').toLowerCase();
+          const targetUrl = (info.target_url || '').toLowerCase();
+          const nameMatches = manifestName.includes(wantedName) || manifestName.includes('singlefile') || manifestName.includes('single-file');
+          const urlMatches = targetUrl.includes('singlefile') || targetUrl.includes('single-file') || targetUrl.includes('single-file-extension');
+          if (nameMatches || urlMatches) {
+            matchTarget = target;
+            matchInfo = info;
+            break;
+          }
+        }
+
+        if (!matchTarget) {
+          if (Date.now() - lastLog > 5000) {
+            const targetsSummary = [];
+            for (const target of targets) {
+              const info = await chromeUtils.isTargetExtension(target);
+              if (!info?.target_is_extension) {
+                continue;
+              }
+              targetsSummary.push({
+                type: info.target_type,
+                url: info.target_url,
+                extensionId: info.extension_id,
+                manifestName: info.manifest_name,
+              });
+            }
+            console.error(`[singlefile] waiting... targets total=${targets.length} extensions=${targetsSummary.length} details=${JSON.stringify(targetsSummary)}`);
+            lastLog = Date.now();
+          }
+          await new Promise(r => setTimeout(r, 500));
+        }
+      }
+
+      if (!matchTarget || !matchInfo) {
+        const targets = chromeUtils.getExtensionTargets(browser);
+        console.error(`[singlefile] extension target not found (name=${extension.name})`);
+        console.error(`[singlefile] available targets: ${JSON.stringify(targets)}`);
+        await browser.disconnect();
+        process.exit(5);
+      }
+
+      // Use the runtime extension id from the matched target
+      extension.id = matchInfo.extension_id;
+
+      console.error('[singlefile] loading extension from target...');
+      await chromeUtils.loadExtensionFromTarget([extension], matchTarget);
+      if (typeof extension.dispatchAction !== 'function') {
+        const targets = chromeUtils.getExtensionTargets(browser);
+        console.error(`[singlefile] extension dispatchAction missing for id=${extension.id}`);
+        console.error(`[singlefile] available targets: ${JSON.stringify(targets)}`);
+        await browser.disconnect();
+        process.exit(6);
+      }
+      console.error('[singlefile] setting download dir...');
+      await setDownloadDir(page, DOWNLOADS_DIR);
+
+      console.error('[singlefile] triggering save via extension...');
+      const output = await saveSinglefileWithExtension(page, extension, { downloadsDir: DOWNLOADS_DIR });
+      if (output && fs.existsSync(output)) {
+        console.error(`[singlefile] saved: ${output}`);
+        console.log(output);
+        await browser.disconnect();
+        process.exit(0);
+      }
+
+      console.error('[❌] SingleFile extension did not produce output');
+      await browser.disconnect();
+      process.exit(3);
+    } catch (err) {
+      await browser.disconnect();
+      throw err;
+    }
+  } catch (err) {
+    console.error(`[❌] ${err.message || err}`);
+    process.exit(4);
+  }
+}
+
+if (require.main === module) {
+  main();
+}
diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py
index 5489739d..debea7f3 100644
--- a/archivebox/plugins/ublock/tests/test_ublock.py
+++ b/archivebox/plugins/ublock/tests/test_ublock.py
@@ -483,8 +483,7 @@ const puppeteer = require('puppeteer-core');
 
         result = subprocess.run(
             ['node', str(script_path)],
-            cwd=str(tmpdir,
-            env=get_test_env()),
+            cwd=str(tmpdir),
             capture_output=True,
             text=True,
             env=env,
diff --git a/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py b/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py
index 3ebf22b2..f62b21b5 100644
--- a/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py
+++ b/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py
@@ -144,6 +144,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
     try:
         result = subprocess.run(
             cmd,
+            capture_output=True,
+            text=True,
             timeout=timeout * 2,  # Allow extra time for large downloads
         )
 
@@ -166,7 +168,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
             output_path = str(html_files[0]) if html_files else str(downloaded_files[0])
 
         # Parse download stats from wget output
-        output_tail = result.stderr.decode('utf-8', errors='replace').strip().split('\n')[-3:]
+        stderr_text = (result.stderr or '')
+        output_tail = stderr_text.strip().split('\n')[-3:] if stderr_text else []
         files_count = len(downloaded_files)
 
         return True, output_path, ''
diff --git a/archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py b/archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py
index 633765ef..fbf841ae 100644
--- a/archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py
+++ b/archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py
@@ -201,7 
+201,7 @@ def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]:
         if 'Unable to extract' in stderr:
             return False, None, 'Unable to extract media info'
 
-        return False, None, f'yt-dlp error: {stderr[:200]}'
+        return False, None, f'yt-dlp error: {stderr}'
 
     except subprocess.TimeoutExpired:
         return False, None, f'Timed out after {timeout} seconds'
diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py
index 614f8e0c..64f92824 100644
--- a/archivebox/workers/orchestrator.py
+++ b/archivebox/workers/orchestrator.py
@@ -459,7 +459,6 @@ class Orchestrator:
         # Enable progress layout only in TTY + foreground mode
         show_progress = IS_TTY and self.exit_on_idle
         plain_output = not IS_TTY
-
         self.on_startup()
 
         if not show_progress:
@@ -520,7 +519,6 @@ class Orchestrator:
 
     def _run_orchestrator_loop(self, progress_layout, plain_output: bool = False):
         """Run the main orchestrator loop with optional progress display."""
-        last_queue_sizes = {}
         last_snapshot_count = None
         tick_count = 0
         last_plain_lines: set[tuple[str, str]] = set()
@@ -611,6 +609,21 @@ class Orchestrator:
                 seconds = max(0.0, float(total_seconds))
                 return f"{seconds:.1f}s"
 
+            def _tail_stderr_line(proc) -> str:
+                try:
+                    path = getattr(proc, 'stderr_file', None)
+                    if not path or not path.exists():
+                        return ''
+                    with open(path, 'rb') as f:
+                        f.seek(0, os.SEEK_END)
+                        size = f.tell()
+                        f.seek(max(0, size - 4096))
+                        data = f.read().decode('utf-8', errors='ignore')
+                    lines = [ln.strip() for ln in data.splitlines() if ln.strip()]
+                    return lines[-1] if lines else ''
+                except Exception:
+                    return ''
+
             tree_data: list[dict] = []
             for crawl in crawls:
                 urls = crawl.get_urls_list()
@@ -684,7 +697,10 @@ class Orchestrator:
                         elapsed = ''
                         timeout = ''
                         size = ''
+                        stderr_tail = ''
                         if ar:
+                            if ar.process_id and ar.process:
+                                stderr_tail = _tail_stderr_line(ar.process)
                             if ar.status == ArchiveResult.StatusChoices.STARTED:
                                 status = 'started'
                                 is_running = True
@@ -700,6 +716,8 @@ class Orchestrator:
                                     timeout = _format_seconds(hook_timeout)
                             else:
                                 status = ar.status
+                                if ar.process_id and ar.process and ar.process.exit_code == 137:
+                                    status = 'failed'
                                 is_pending = False
                                 start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
                                 end_ts = ar.end_ts or (ar.process.ended_at if ar.process_id and ar.process else None)
@@ -724,6 +742,7 @@ class Orchestrator:
                             'is_running': is_running,
                             'is_pending': is_pending,
                             'hook_name': hook_name,
+                            'stderr': stderr_tail,
                         })
 
                     hooks = []
@@ -734,6 +753,7 @@ class Orchestrator:
                         any_succeeded = any(h['status'] == ArchiveResult.StatusChoices.SUCCEEDED for h in hook_entries)
                         any_skipped = any(h['status'] == ArchiveResult.StatusChoices.SKIPPED for h in hook_entries)
 
+                        stderr_tail = ''
                        if running:
                             status = 'started'
                             is_running = True
@@ -741,6 +761,7 @@ class Orchestrator:
                             is_bg = running['is_bg']
                             elapsed = running.get('elapsed', '')
                             timeout = running.get('timeout', '')
+                            stderr_tail = running.get('stderr', '')
                             size = ''
                         elif pending:
                             status = 'pending'
@@ -749,6 +770,7 @@ class Orchestrator:
                             is_bg = pending['is_bg']
                             elapsed = pending.get('elapsed', '') or _format_seconds(0)
                             timeout = pending.get('timeout', '')
+                            stderr_tail = pending.get('stderr', '')
                             size = ''
                         else:
                             is_running = False
@@ -762,6 +784,10 @@ class Orchestrator:
                                 status = 'skipped'
                             else:
                                 status = 'skipped'
+                            for h in hook_entries:
+                                if h.get('stderr'):
+                                    stderr_tail = h['stderr']
+                                    break
                             total_elapsed = 0.0
                             has_elapsed = False
                             for h in hook_entries:
@@ -793,6 +819,7 @@ class Orchestrator:
                             'is_bg': is_bg,
                             'is_running': is_running,
                             'is_pending': is_pending,
+                            'stderr': stderr_tail,
                         })
 
                     snap_label = _abbrev(f"{str(snap.id)[-8:]} {snap.url or ''}".strip(), max_len=80)
@@ -857,8 +884,6 @@ class Orchestrator:
 
             progress_layout.update_process_panels(running_processes, pending=pending_processes)
 
-            last_queue_sizes = queue_sizes.copy()
-
             # Update snapshot progress
             from archivebox.core.models import Snapshot