diff --git a/archivebox/plugins/custom/on_Binary__14_custom_install.py b/archivebox/plugins/custom/on_Binary__14_custom_install.py index 7e523d54..47eea07f 100644 --- a/archivebox/plugins/custom/on_Binary__14_custom_install.py +++ b/archivebox/plugins/custom/on_Binary__14_custom_install.py @@ -44,12 +44,10 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c result = subprocess.run( custom_cmd, shell=True, - capture_output=True, - text=True, timeout=600, # 10 minute timeout for custom installs ) if result.returncode != 0: - click.echo(f"Custom install failed: {result.stderr}", err=True) + click.echo(f"Custom install failed (exit={result.returncode})", err=True) sys.exit(1) except subprocess.TimeoutExpired: click.echo("Custom install timed out", err=True) diff --git a/archivebox/plugins/git/on_Snapshot__05_git.bg.py b/archivebox/plugins/git/on_Snapshot__05_git.bg.py index 14ad7894..c124ddbe 100644 --- a/archivebox/plugins/git/on_Snapshot__05_git.bg.py +++ b/archivebox/plugins/git/on_Snapshot__05_git.bg.py @@ -82,13 +82,12 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]: cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR] try: - result = subprocess.run(cmd, capture_output=True, timeout=timeout) + result = subprocess.run(cmd, timeout=timeout) if result.returncode == 0 and Path(OUTPUT_DIR).is_dir(): return True, OUTPUT_DIR, '' else: - stderr = result.stderr.decode('utf-8', errors='replace') - return False, None, f'git clone failed: {stderr[:200]}' + return False, None, f'git clone failed (exit={result.returncode})' except subprocess.TimeoutExpired: return False, None, f'Timed out after {timeout} seconds' diff --git a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py index 5b710711..a57c8933 100644 --- a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py +++ b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py @@ -81,11 +81,10 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: try: # Get text version cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text'] - result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout) + result_text = subprocess.run(cmd_text, stdout=subprocess.PIPE, timeout=timeout, text=True) if result_text.returncode != 0: - stderr = result_text.stderr.decode('utf-8', errors='replace') - return False, None, f'postlight-parser failed: {stderr[:200]}' + return False, None, f'postlight-parser failed (exit={result_text.returncode})' try: text_json = json.loads(result_text.stdout) @@ -101,7 +100,7 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: # Get HTML version cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html'] - result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout) + result_html = subprocess.run(cmd_html, stdout=subprocess.PIPE, timeout=timeout, text=True) try: html_json = json.loads(result_html.stdout) diff --git a/archivebox/plugins/pip/on_Binary__11_pip_install.py b/archivebox/plugins/pip/on_Binary__11_pip_install.py index 8737a042..468a2916 100644 --- a/archivebox/plugins/pip/on_Binary__11_pip_install.py +++ b/archivebox/plugins/pip/on_Binary__11_pip_install.py @@ -62,8 +62,6 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override subprocess.run( [preferred_python, '-m', 'venv', str(pip_venv_path), '--upgrade-deps'], check=True, - capture_output=True, - text=True, ) except Exception: # Fall back to PipProvider-managed venv creation diff --git a/archivebox/plugins/readability/on_Snapshot__56_readability.py b/archivebox/plugins/readability/on_Snapshot__56_readability.py index 4c23fa28..06c8ee8b 100644 --- a/archivebox/plugins/readability/on_Snapshot__56_readability.py +++ b/archivebox/plugins/readability/on_Snapshot__56_readability.py @@ -107,11 +107,10 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]: try: # Run readability-extractor (outputs JSON by default) cmd = [binary, *readability_args, *readability_args_extra, html_source] - result = subprocess.run(cmd, capture_output=True, timeout=timeout) + result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True) if result.returncode != 0: - stderr = result.stderr.decode('utf-8', errors='replace') - return False, None, f'readability-extractor failed: {stderr[:200]}' + return False, None, f'readability-extractor failed (exit={result.returncode})' # Parse JSON output try: diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py index aa73d69e..44362dc3 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py @@ -23,6 +23,7 @@ import json import os import subprocess import sys +import threading import time from urllib.request import urlopen from pathlib import Path @@ -200,18 +201,44 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: cmd.extend([url, str(output_path)]) try: - result = subprocess.run(cmd, capture_output=True, timeout=timeout) + output_lines: list[str] = [] + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + def _read_output() -> None: + if not process.stdout: + return + for line in process.stdout: + output_lines.append(line) + sys.stderr.write(line) + + reader = threading.Thread(target=_read_output, daemon=True) + reader.start() + + try: + process.wait(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + reader.join(timeout=1) + return False, None, f'Timed out after {timeout} seconds' + + reader.join(timeout=1) + combined_output = ''.join(output_lines) if output_path.exists() and output_path.stat().st_size > 0: return True, str(output_path), '' else: - stderr = result.stderr.decode('utf-8', errors='replace') - stdout = result.stdout.decode('utf-8', errors='replace') + stderr = combined_output if 'ERR_NAME_NOT_RESOLVED' in stderr: return False, None, 'DNS resolution failed' if 'ERR_CONNECTION_REFUSED' in stderr: return False, None, 'Connection refused' - detail = (stderr or stdout).strip() + detail = (stderr or '').strip() if len(detail) > 2000: detail = detail[:2000] cmd_preview = list(cmd) diff --git a/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py b/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py index bf60ea58..3ebf22b2 100644 --- a/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py +++ b/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py @@ -144,7 +144,6 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]: try: result = subprocess.run( cmd, - capture_output=True, timeout=timeout * 2, # Allow extra time for large downloads ) @@ -155,18 +154,9 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]: ] if not downloaded_files: - stderr = result.stderr.decode('utf-8', errors='replace') - stdout = result.stdout.decode('utf-8', errors='replace') - combined = stderr + stdout - - if '403' in combined or 'Forbidden' in combined: - return False, None, '403 Forbidden (try changing USER_AGENT)' - elif '404' in combined or 'Not Found' in combined: - return False, None, '404 Not Found' - elif '500' in combined: - return False, None, '500 Internal Server Error' - else: - return False, None, f'No files downloaded: {stderr[:200]}' + if result.returncode != 0: + return False, None, f'wget failed (exit={result.returncode})' + return False, None, 'No files downloaded' # Find main HTML file html_files = [ diff --git a/pyproject.toml b/pyproject.toml index 3c12fffd..65983d51 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "archivebox" -version = "0.9.2" +version = "0.9.3" requires-python = ">=3.13" description = "Self-hosted internet archiving solution." authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]