From 7ceaeae2d9d052557bcae5ce88c9645b7055ef09 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 4 Jan 2026 22:38:15 -0800 Subject: [PATCH] rename archive_org to archivedotorg, add BinaryWorker, fix config pass-through --- .claude/settings.local.json | 17 + archivebox/base_models/models.py | 2 +- archivebox/cli/archivebox_run.py | 42 +- archivebox/config/configset.py | 26 +- archivebox/core/forms.py | 2 +- .../core/migrations/0007_archiveresult.py | 2 +- .../migrations/0011_auto_20210216_1331.py | 2 +- .../migrations/0021_auto_20220914_0934.py | 2 +- .../migrations/0022_auto_20231023_2008.py | 2 +- archivebox/core/models.py | 2 +- archivebox/core/settings.py | 5 +- .../0004_remove_crawl_output_dir.py | 17 + archivebox/hooks.py | 76 +-- .../0011_remove_binary_output_dir.py | 17 + archivebox/personas/apps.py | 2 +- .../personas/migrations/0001_initial.py | 3 +- .../migrations/0002_alter_persona_id.py | 19 + archivebox/personas/models.py | 2 + .../config.json | 0 .../on_Snapshot__13_archivedotorg.py} | 8 +- .../templates/icon.html | 0 .../templates/thumbnail.html | 0 .../tests/test_archivedotorg.py} | 8 +- .../screenshot/on_Snapshot__51_screenshot.js | 17 +- .../screenshot/tests/test_screenshot.py | 3 - archivebox/plugins/ytdlp/config.json | 2 +- .../tests/test_cli_run_binary_worker.py | 256 +++++++++ .../tests/test_worker_config_propagation.py | 508 ++++++++++++++++++ archivebox/workers/orchestrator.py | 63 ++- archivebox/workers/worker.py | 110 +++- bin/test_plugins.sh | 2 + old/TODO_hook_architecture.md | 4 +- 32 files changed, 1111 insertions(+), 110 deletions(-) create mode 100644 archivebox/crawls/migrations/0004_remove_crawl_output_dir.py create mode 100644 archivebox/machine/migrations/0011_remove_binary_output_dir.py create mode 100644 archivebox/personas/migrations/0002_alter_persona_id.py rename archivebox/plugins/{archive_org => archivedotorg}/config.json (100%) rename archivebox/plugins/{archive_org/on_Snapshot__13_archive_org.py => archivedotorg/on_Snapshot__13_archivedotorg.py} (95%) rename archivebox/plugins/{archive_org => archivedotorg}/templates/icon.html (100%) rename archivebox/plugins/{archive_org => archivedotorg}/templates/thumbnail.html (100%) rename archivebox/plugins/{archive_org/tests/test_archive_org.py => archivedotorg/tests/test_archivedotorg.py} (95%) create mode 100644 archivebox/tests/test_cli_run_binary_worker.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json index ae6afbbb..abce917c 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -1,6 +1,9 @@ { "permissions": { "allow": [ + "Read(**)", + "Glob(**)", + "Grep(**)", "Bash(python -m archivebox:*)", "Bash(ls:*)", "Bash(xargs:*)", @@ -29,5 +32,19 @@ "Bash(done)", "Bash(coverage erase:*)" ] + }, + "hooks": { + "PreToolUse": [ + { + "matcher": "Bash", + "hooks": [ + { + "type": "command", + "command": "REPO_ROOT=$(git rev-parse --show-toplevel 2>/dev/null); if [ -n \"$REPO_ROOT\" ] && [ \"$PWD\" != \"$REPO_ROOT\" ]; then echo \"ERROR: Not in repo root ($REPO_ROOT). Current dir: $PWD\" >&2; exit 1; fi", + "statusMessage": "Checking working directory..." + } + ] + } + ] } } diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py index 7d0bbb05..c036edd1 100755 --- a/archivebox/base_models/models.py +++ b/archivebox/base_models/models.py @@ -128,4 +128,4 @@ class ModelWithOutputDir(ModelWithUUID): @property def output_dir(self) -> Path: - raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property') + raise NotImplementedError(f"{self.__class__.__name__} must implement output_dir property") diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py index 0beed5e2..f27c2cb3 100644 --- a/archivebox/cli/archivebox_run.py +++ b/archivebox/cli/archivebox_run.py @@ -59,10 +59,11 @@ def process_stdin_records() -> int: """ from django.utils import timezone - from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT + from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.core.models import Snapshot, ArchiveResult from archivebox.crawls.models import Crawl + from archivebox.machine.models import Binary from archivebox.workers.orchestrator import Orchestrator records = list(read_stdin()) @@ -137,6 +138,26 @@ def process_stdin_records() -> int: output_records.append(archiveresult.to_json()) queued_count += 1 + elif record_type == TYPE_BINARY: + # Binary records - create or update and queue for installation + if record_id: + # Existing binary - re-queue + try: + binary = Binary.objects.get(id=record_id) + except Binary.DoesNotExist: + binary = Binary.from_json(record) + else: + # New binary - create it + binary = Binary.from_json(record) + + if binary: + binary.retry_at = timezone.now() + if binary.status != Binary.StatusChoices.INSTALLED: + binary.status = Binary.StatusChoices.QUEUED + binary.save() + output_records.append(binary.to_json()) + queued_count += 1 + else: # Unknown type - pass through output_records.append(record) @@ -222,7 +243,8 @@ def run_snapshot_worker(snapshot_id: str) -> int: @click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)") @click.option('--crawl-id', help="Run orchestrator for specific crawl only") @click.option('--snapshot-id', help="Run worker for specific snapshot only") -def main(daemon: bool, crawl_id: str, snapshot_id: str): +@click.option('--binary-id', help="Run worker for specific binary only") +def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str): """ Process queued work. @@ -231,11 +253,27 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str): - No args + TTY: Run orchestrator for all work - --crawl-id: Run orchestrator for that crawl only - --snapshot-id: Run worker for that snapshot only + - --binary-id: Run worker for that binary only """ # Snapshot worker mode if snapshot_id: sys.exit(run_snapshot_worker(snapshot_id)) + # Binary worker mode + if binary_id: + from archivebox.workers.worker import BinaryWorker + try: + worker = BinaryWorker(binary_id=binary_id, worker_id=0) + worker.runloop() + sys.exit(0) + except KeyboardInterrupt: + sys.exit(0) + except Exception as e: + rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + # Crawl worker mode if crawl_id: from archivebox.workers.worker import CrawlWorker diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index 805cb86e..d4a02141 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -123,6 +123,7 @@ def get_config( user: Any = None, crawl: Any = None, snapshot: Any = None, + archiveresult: Any = None, machine: Any = None, ) -> Dict[str, Any]: """ @@ -145,11 +146,26 @@ def get_config( user: User object with config JSON field crawl: Crawl object with config JSON field snapshot: Snapshot object with config JSON field + archiveresult: ArchiveResult object (auto-fetches snapshot) machine: Machine object with config JSON field (defaults to Machine.current()) + Note: Objects are auto-fetched from relationships if not provided: + - snapshot auto-fetched from archiveresult.snapshot + - crawl auto-fetched from snapshot.crawl + - user auto-fetched from crawl.created_by + Returns: Merged config dict """ + # Auto-fetch related objects from relationships + if snapshot is None and archiveresult and hasattr(archiveresult, "snapshot"): + snapshot = archiveresult.snapshot + + if crawl is None and snapshot and hasattr(snapshot, "crawl"): + crawl = snapshot.crawl + + if user is None and crawl and hasattr(crawl, "created_by"): + user = crawl.created_by from archivebox.config.constants import CONSTANTS from archivebox.config.common import ( SHELL_CONFIG, @@ -197,12 +213,18 @@ def get_config( if machine and hasattr(machine, "config") and machine.config: config.update(machine.config) - # Override with environment variables + # Override with environment variables (for keys that exist in config) for key in config: env_val = os.environ.get(key) if env_val is not None: config[key] = _parse_env_value(env_val, config.get(key)) + # Also add NEW environment variables (not yet in config) + # This is important for worker subprocesses that receive config via Process.env + for key, value in os.environ.items(): + if key.isupper() and key not in config: # Only uppercase keys (config convention) + config[key] = _parse_env_value(value, None) + # Also check plugin config aliases in environment try: from archivebox.hooks import discover_plugin_configs @@ -335,7 +357,7 @@ DEFAULT_WORKER_CONCURRENCY = { "title": 5, "favicon": 5, "headers": 5, - "archive_org": 2, + "archivedotorg": 2, "readability": 3, "mercury": 3, "git": 2, diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index b749951d..0db937ac 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -147,7 +147,7 @@ class AddLinkForm(forms.Form): 'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title' } archiving = { - 'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git', + 'archivedotorg', 'favicon', 'forumdl', 'gallerydl', 'git', 'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget' } parsing = { diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 407e3eda..c052f9ce 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -120,7 +120,7 @@ class Migration(migrations.Migration): ('output', models.CharField(max_length=512)), ('start_ts', models.DateTimeField()), ('end_ts', models.DateTimeField()), - ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)), + ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archivedotorg', 'archivedotorg')], max_length=32)), ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')), ], ), diff --git a/archivebox/core/migrations/0011_auto_20210216_1331.py b/archivebox/core/migrations/0011_auto_20210216_1331.py index d2226674..c00d90ca 100644 --- a/archivebox/core/migrations/0011_auto_20210216_1331.py +++ b/archivebox/core/migrations/0011_auto_20210216_1331.py @@ -19,6 +19,6 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='extractor', - field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), + field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32), ), ] diff --git a/archivebox/core/migrations/0021_auto_20220914_0934.py b/archivebox/core/migrations/0021_auto_20220914_0934.py index 4ef09034..d33f785e 100644 --- a/archivebox/core/migrations/0021_auto_20220914_0934.py +++ b/archivebox/core/migrations/0021_auto_20220914_0934.py @@ -13,6 +13,6 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='extractor', - field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), + field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32), ), ] diff --git a/archivebox/core/migrations/0022_auto_20231023_2008.py b/archivebox/core/migrations/0022_auto_20231023_2008.py index 1b0becef..ffb41fbd 100644 --- a/archivebox/core/migrations/0022_auto_20231023_2008.py +++ b/archivebox/core/migrations/0022_auto_20231023_2008.py @@ -13,6 +13,6 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='extractor', - field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), + field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32), ), ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index ed2fc534..f86ef048 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1973,7 +1973,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea canonical = { 'index_path': 'index.html', 'google_favicon_path': FAVICON_PROVIDER.format(self.domain), - 'archive_org_path': f'https://web.archive.org/web/{self.base_url}', + 'archivedotorg_path': f'https://web.archive.org/web/{self.base_url}', } # Scan each ArchiveResult's output directory for the best file diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index bd1276f6..095db8ea 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -206,7 +206,10 @@ DATABASES = { } MIGRATION_MODULES = {"signal_webhooks": None} -# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0 +# Django requires DEFAULT_AUTO_FIELD to subclass AutoField (BigAutoField, SmallAutoField, etc.) +# Cannot use UUIDField here until Django 6.0 introduces DEFAULT_PK_FIELD setting +# For now: manually add `id = models.UUIDField(primary_key=True, default=uuid7, ...)` to all models +# OR inherit from ModelWithUUID base class which provides UUID primary key DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" diff --git a/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py b/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py new file mode 100644 index 00000000..3de115bc --- /dev/null +++ b/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py @@ -0,0 +1,17 @@ +# Generated by Django 6.0 on 2026-01-05 01:09 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('crawls', '0003_remove_crawlschedule_num_uses_failed_and_more'), + ] + + operations = [ + migrations.RemoveField( + model_name='crawl', + name='output_dir', + ), + ] diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 69de28ba..900f8c3a 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -370,20 +370,34 @@ def run_hook( from archivebox.machine.models import Machine machine = Machine.current() if machine and machine.config: - machine_path = machine.config.get('config/PATH') + machine_path = machine.config.get('PATH') if machine_path: # Prepend LIB_BIN_DIR to machine PATH as well if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'): env['PATH'] = f'{lib_bin_dir}:{machine_path}' else: env['PATH'] = machine_path - # Also set NODE_MODULES_DIR if configured - node_modules_dir = machine.config.get('config/NODE_MODULES_DIR') - if node_modules_dir: - env['NODE_MODULES_DIR'] = node_modules_dir except Exception: pass # Fall back to system PATH if Machine not available + # Set NODE_PATH for Node.js module resolution + # Priority: config dict > Machine.config > derive from LIB_DIR + node_path = config.get('NODE_PATH') + if not node_path and lib_dir: + # Derive from LIB_DIR/npm/node_modules + node_modules_dir = Path(lib_dir) / 'npm' / 'node_modules' + if node_modules_dir.exists(): + node_path = str(node_modules_dir) + if not node_path: + try: + # Fallback to Machine.config + node_path = machine.config.get('NODE_MODULES_DIR') + except Exception: + pass + if node_path: + env['NODE_PATH'] = node_path + env['NODE_MODULES_DIR'] = node_path # For backwards compatibility + # Export all config values to environment (already merged by get_config()) for key, value in config.items(): if value is None: @@ -414,56 +428,8 @@ def run_hook( timeout=timeout, ) - # Build environment from config (Process._build_env() expects self.env dict) - # We need to set env on the process before launching - process.env = {} - for key, value in config.items(): - if value is None: - continue - elif isinstance(value, bool): - process.env[key] = 'true' if value else 'false' - elif isinstance(value, (list, dict)): - process.env[key] = json.dumps(value) - else: - process.env[key] = str(value) - - # Add base paths to env - process.env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd())) - process.env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive')) - process.env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', '')) - - # Add LIB_DIR and LIB_BIN_DIR - lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None)) - lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None)) - if lib_dir: - process.env['LIB_DIR'] = str(lib_dir) - if not lib_bin_dir and lib_dir: - lib_bin_dir = Path(lib_dir) / 'bin' - if lib_bin_dir: - process.env['LIB_BIN_DIR'] = str(lib_bin_dir) - - # Set PATH from Machine.config if available - try: - if machine and machine.config: - machine_path = machine.config.get('config/PATH') - if machine_path: - # Prepend LIB_BIN_DIR to machine PATH as well - if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'): - process.env['PATH'] = f'{lib_bin_dir}:{machine_path}' - else: - process.env['PATH'] = machine_path - elif lib_bin_dir: - # Just prepend to current PATH - current_path = os.environ.get('PATH', '') - if not current_path.startswith(f'{lib_bin_dir}:'): - process.env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else str(lib_bin_dir) - - # Also set NODE_MODULES_DIR if configured - node_modules_dir = machine.config.get('config/NODE_MODULES_DIR') - if node_modules_dir: - process.env['NODE_MODULES_DIR'] = node_modules_dir - except Exception: - pass # Fall back to system PATH if Machine not available + # Copy the env dict we already built (includes os.environ + all customizations) + process.env = env.copy() # Save env before launching process.save() diff --git a/archivebox/machine/migrations/0011_remove_binary_output_dir.py b/archivebox/machine/migrations/0011_remove_binary_output_dir.py new file mode 100644 index 00000000..fc29b9bb --- /dev/null +++ b/archivebox/machine/migrations/0011_remove_binary_output_dir.py @@ -0,0 +1,17 @@ +# Generated by Django 6.0 on 2026-01-05 01:09 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('machine', '0010_alter_process_process_type'), + ] + + operations = [ + migrations.RemoveField( + model_name='binary', + name='output_dir', + ), + ] diff --git a/archivebox/personas/apps.py b/archivebox/personas/apps.py index 9a1cfb90..df45c266 100644 --- a/archivebox/personas/apps.py +++ b/archivebox/personas/apps.py @@ -1,7 +1,7 @@ from django.apps import AppConfig -class SessionsConfig(AppConfig): +class PersonasConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" name = "archivebox.personas" label = "personas" diff --git a/archivebox/personas/migrations/0001_initial.py b/archivebox/personas/migrations/0001_initial.py index d85613c3..f110d526 100644 --- a/archivebox/personas/migrations/0001_initial.py +++ b/archivebox/personas/migrations/0001_initial.py @@ -1,6 +1,7 @@ # Generated by Django 6.0 on 2025-12-31 09:06 import archivebox.base_models.models +from archivebox.uuid_compat import uuid7 import django.db.models.deletion import django.utils.timezone from django.conf import settings @@ -19,7 +20,7 @@ class Migration(migrations.Migration): migrations.CreateModel( name='Persona', fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('id', models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)), ('config', models.JSONField(blank=True, default=dict, null=True)), ('name', models.CharField(max_length=64, unique=True)), ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), diff --git a/archivebox/personas/migrations/0002_alter_persona_id.py b/archivebox/personas/migrations/0002_alter_persona_id.py new file mode 100644 index 00000000..e8e5af2a --- /dev/null +++ b/archivebox/personas/migrations/0002_alter_persona_id.py @@ -0,0 +1,19 @@ +# Generated by Django 6.0 on 2026-01-05 01:09 + +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('personas', '0001_initial'), + ] + + operations = [ + migrations.AlterField( + model_name='persona', + name='id', + field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + ] diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py index 470ec846..1bf5b1a0 100644 --- a/archivebox/personas/models.py +++ b/archivebox/personas/models.py @@ -19,6 +19,7 @@ from django.conf import settings from django.utils import timezone from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk +from archivebox.uuid_compat import uuid7 if TYPE_CHECKING: from django.db.models import QuerySet @@ -44,6 +45,7 @@ class Persona(ModelWithConfig): persona.CHROME_USER_DATA_DIR # -> Path to chrome_user_data """ + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) name = models.CharField(max_length=64, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk) diff --git a/archivebox/plugins/archive_org/config.json b/archivebox/plugins/archivedotorg/config.json similarity index 100% rename from archivebox/plugins/archive_org/config.json rename to archivebox/plugins/archivedotorg/config.json diff --git a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py b/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py similarity index 95% rename from archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py rename to archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py index 820c261f..5490008d 100644 --- a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py +++ b/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py @@ -2,7 +2,7 @@ """ Submit a URL to archive.org for archiving. -Usage: on_Snapshot__archive_org.py --url= --snapshot-id= +Usage: on_Snapshot__archivedotorg.py --url= --snapshot-id= Output: Writes archive.org.txt to $PWD with the archived URL Environment variables: @@ -25,7 +25,7 @@ import rich_click as click # Extractor metadata -PLUGIN_NAME = 'archive_org' +PLUGIN_NAME = 'archivedotorg' OUTPUT_DIR = '.' OUTPUT_FILE = 'archive.org.txt' @@ -41,7 +41,7 @@ def get_env_int(name: str, default: int = 0) -> int: return default -def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]: +def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]: """ Submit URL to archive.org Wayback Machine. @@ -113,7 +113,7 @@ def main(url: str, snapshot_id: str): try: # Run extraction - success, output, error = submit_to_archive_org(url) + success, output, error = submit_to_archivedotorg(url) if success: # Success - emit ArchiveResult with output file diff --git a/archivebox/plugins/archive_org/templates/icon.html b/archivebox/plugins/archivedotorg/templates/icon.html similarity index 100% rename from archivebox/plugins/archive_org/templates/icon.html rename to archivebox/plugins/archivedotorg/templates/icon.html diff --git a/archivebox/plugins/archive_org/templates/thumbnail.html b/archivebox/plugins/archivedotorg/templates/thumbnail.html similarity index 100% rename from archivebox/plugins/archive_org/templates/thumbnail.html rename to archivebox/plugins/archivedotorg/templates/thumbnail.html diff --git a/archivebox/plugins/archive_org/tests/test_archive_org.py b/archivebox/plugins/archivedotorg/tests/test_archivedotorg.py similarity index 95% rename from archivebox/plugins/archive_org/tests/test_archive_org.py rename to archivebox/plugins/archivedotorg/tests/test_archivedotorg.py index d43fd962..1e4b4a97 100644 --- a/archivebox/plugins/archive_org/tests/test_archive_org.py +++ b/archivebox/plugins/archivedotorg/tests/test_archivedotorg.py @@ -1,5 +1,5 @@ """ -Integration tests for archive_org plugin +Integration tests for archivedotorg plugin Tests verify standalone archive.org extractor execution. """ @@ -12,13 +12,13 @@ from pathlib import Path import pytest PLUGIN_DIR = Path(__file__).parent.parent -ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archive_org.*'), None) +ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None) TEST_URL = 'https://example.com' def test_hook_script_exists(): assert ARCHIVEDOTORG_HOOK.exists() -def test_submits_to_archive_org(): +def test_submits_to_archivedotorg(): with tempfile.TemporaryDirectory() as tmpdir: result = subprocess.run( [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'], @@ -49,7 +49,7 @@ def test_submits_to_archive_org(): assert not result_json, "Should NOT emit JSONL on transient error" assert result.stderr, "Should have error message in stderr" -def test_config_save_archive_org_false_skips(): +def test_config_save_archivedotorg_false_skips(): with tempfile.TemporaryDirectory() as tmpdir: import os env = os.environ.copy() diff --git a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js index 52c49b59..d99460c9 100644 --- a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js +++ b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js @@ -26,11 +26,20 @@ const { readCdpUrl, } = require('../chrome/chrome_utils.js'); +// Flush V8 coverage before exit (needed for NODE_V8_COVERAGE to capture early exits) +function flushCoverageAndExit(exitCode) { + if (process.env.NODE_V8_COVERAGE) { + const v8 = require('v8'); + v8.takeCoverage(); + } + process.exit(exitCode); +} + // Check if screenshot is enabled BEFORE requiring puppeteer if (!getEnvBool('SCREENSHOT_ENABLED', true)) { console.error('Skipping screenshot (SCREENSHOT_ENABLED=False)'); // Temporary failure (config disabled) - NO JSONL emission - process.exit(0); + flushCoverageAndExit(0); } // Now safe to require puppeteer @@ -135,7 +144,7 @@ async function main() { if (!url || !snapshotId) { console.error('Usage: on_Snapshot__51_screenshot.js --url= --snapshot-id='); - process.exit(1); + flushCoverageAndExit(1); } // Check if staticfile extractor already handled this (permanent skip) @@ -147,7 +156,7 @@ async function main() { status: 'skipped', output_str: 'staticfile already handled', })); - process.exit(0); + flushCoverageAndExit(0); } // Take screenshot (throws on error) @@ -166,5 +175,5 @@ async function main() { main().catch(e => { // Transient error - emit NO JSONL console.error(`ERROR: ${e.message}`); - process.exit(1); + flushCoverageAndExit(1); }); diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index d3f09c30..2d804757 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -226,9 +226,6 @@ def test_config_save_screenshot_false_skips(): print(f"[DEBUG RESULT] Exit code: {result.returncode}") print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}") - # FORCE FAILURE to verify test actually runs - assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}" - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" # Feature disabled - temporary failure, should NOT emit JSONL diff --git a/archivebox/plugins/ytdlp/config.json b/archivebox/plugins/ytdlp/config.json index eb76ac3b..2a98e24e 100644 --- a/archivebox/plugins/ytdlp/config.json +++ b/archivebox/plugins/ytdlp/config.json @@ -74,7 +74,7 @@ "--geo-bypass", "--add-metadata", "--no-progress", - "--remote-components ejs:github", + "--remote-components=ejs:github", "-o", "%(title)s.%(ext)s" ], diff --git a/archivebox/tests/test_cli_run_binary_worker.py b/archivebox/tests/test_cli_run_binary_worker.py new file mode 100644 index 00000000..25fefacd --- /dev/null +++ b/archivebox/tests/test_cli_run_binary_worker.py @@ -0,0 +1,256 @@ +""" +Tests for BinaryWorker processing Binary queue. + +Tests cover: +- BinaryWorker is spawned by Orchestrator when Binary queue has work +- Binary hooks (on_Binary__*) actually run and install binaries +- Binary status transitions from QUEUED -> INSTALLED +- BinaryWorker exits after idle timeout +""" + +import json +import sqlite3 +import time + +from archivebox.tests.conftest import ( + run_archivebox_cmd, + parse_jsonl_output, +) + + +class TestBinaryWorkerSpawning: + """Tests for BinaryWorker lifecycle.""" + + def test_binary_worker_spawns_when_binary_queued(self, initialized_archive): + """Orchestrator spawns BinaryWorker when Binary queue has work.""" + # Create a Binary record via CLI + binary_record = { + 'type': 'Binary', + 'name': 'python3', + 'binproviders': 'env', # Use env provider to detect system python + } + + # Use `archivebox run` to create the Binary (this queues it) + stdout, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(binary_record), + data_dir=initialized_archive, + timeout=30, + ) + + assert code == 0, f"Failed to create Binary: {stderr}" + + # Verify Binary was created in DB + conn = sqlite3.connect(initialized_archive / 'index.sqlite3') + c = conn.cursor() + binaries = c.execute( + "SELECT name, status, abspath FROM machine_binary WHERE name='python3'" + ).fetchall() + conn.close() + + assert len(binaries) >= 1, "Binary was not created in database" + name, status, abspath = binaries[0] + assert name == 'python3' + # Status should be INSTALLED after BinaryWorker processed it + # (or QUEUED if worker timed out before installing) + assert status in ['installed', 'queued'] + + + def test_binary_hooks_actually_run(self, initialized_archive): + """Binary installation hooks (on_Binary__*) run and update abspath.""" + # Create a Binary for python3 (guaranteed to exist on system) + binary_record = { + 'type': 'Binary', + 'name': 'python3', + 'binproviders': 'env', + } + + stdout, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(binary_record), + data_dir=initialized_archive, + timeout=30, + ) + + assert code == 0, f"Failed to process Binary: {stderr}" + + # Query database to check if hooks ran and populated abspath + conn = sqlite3.connect(initialized_archive / 'index.sqlite3') + c = conn.cursor() + result = c.execute( + "SELECT name, status, abspath, version FROM machine_binary WHERE name='python3'" + ).fetchone() + conn.close() + + assert result is not None, "Binary not found in database" + name, status, abspath, version = result + + # If hooks ran successfully, abspath should be populated + if status == 'installed': + assert abspath, f"Binary installed but abspath is empty: {abspath}" + assert '/python3' in abspath or '\\python3' in abspath, \ + f"abspath doesn't look like a python3 path: {abspath}" + # Version should also be populated + assert version, f"Binary installed but version is empty: {version}" + + + def test_binary_status_transitions(self, initialized_archive): + """Binary status correctly transitions QUEUED -> INSTALLED.""" + binary_record = { + 'type': 'Binary', + 'name': 'python3', + 'binproviders': 'env', + } + + # Create and process the Binary + stdout, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(binary_record), + data_dir=initialized_archive, + timeout=30, + ) + + assert code == 0 + + # Check final status + conn = sqlite3.connect(initialized_archive / 'index.sqlite3') + c = conn.cursor() + status = c.execute( + "SELECT status FROM machine_binary WHERE name='python3'" + ).fetchone() + conn.close() + + assert status is not None + # Should be installed (or queued if worker timed out) + assert status[0] in ['installed', 'queued'] + + +class TestBinaryWorkerHooks: + """Tests for specific Binary hook providers.""" + + def test_env_provider_hook_detects_system_binary(self, initialized_archive): + """on_Binary__15_env_install.py hook detects system binaries.""" + binary_record = { + 'type': 'Binary', + 'name': 'python3', + 'binproviders': 'env', + } + + stdout, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(binary_record), + data_dir=initialized_archive, + timeout=30, + ) + + assert code == 0 + + # Check that env provider hook populated the Binary + conn = sqlite3.connect(initialized_archive / 'index.sqlite3') + c = conn.cursor() + result = c.execute( + "SELECT binprovider, abspath FROM machine_binary WHERE name='python3' AND status='installed'" + ).fetchone() + conn.close() + + if result: + binprovider, abspath = result + assert binprovider == 'env', f"Expected env provider, got: {binprovider}" + assert abspath, "abspath should be populated by env provider" + + + def test_multiple_binaries_processed_in_batch(self, initialized_archive): + """BinaryWorker processes multiple queued binaries.""" + # Create multiple Binary records + binaries = [ + {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'}, + {'type': 'Binary', 'name': 'curl', 'binproviders': 'env'}, + ] + + stdin = '\n'.join(json.dumps(b) for b in binaries) + + stdout, stderr, code = run_archivebox_cmd( + ['run'], + stdin=stdin, + data_dir=initialized_archive, + timeout=45, + ) + + assert code == 0 + + # Both should be processed + conn = sqlite3.connect(initialized_archive / 'index.sqlite3') + c = conn.cursor() + installed = c.execute( + "SELECT name FROM machine_binary WHERE name IN ('python3', 'curl')" + ).fetchall() + conn.close() + + assert len(installed) >= 1, "At least one binary should be created" + + +class TestBinaryWorkerEdgeCases: + """Tests for edge cases and error handling.""" + + def test_nonexistent_binary_stays_queued(self, initialized_archive): + """Binary that doesn't exist stays queued (doesn't fail permanently).""" + binary_record = { + 'type': 'Binary', + 'name': 'nonexistent-binary-xyz-12345', + 'binproviders': 'env', + } + + stdout, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(binary_record), + data_dir=initialized_archive, + timeout=30, + ) + + # Command should still succeed (orchestrator doesn't fail on binary install failures) + assert code == 0 + + # Binary should remain queued (not installed) + conn = sqlite3.connect(initialized_archive / 'index.sqlite3') + c = conn.cursor() + result = c.execute( + "SELECT status FROM machine_binary WHERE name='nonexistent-binary-xyz-12345'" + ).fetchone() + conn.close() + + if result: + status = result[0] + # Should stay queued since installation failed + assert status == 'queued', f"Expected queued, got: {status}" + + + def test_binary_worker_respects_machine_isolation(self, initialized_archive): + """BinaryWorker only processes binaries for current machine.""" + # This is implicitly tested by other tests - Binary.objects.filter(machine=current) + # ensures only current machine's binaries are processed + binary_record = { + 'type': 'Binary', + 'name': 'python3', + 'binproviders': 'env', + } + + stdout, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(binary_record), + data_dir=initialized_archive, + timeout=30, + ) + + assert code == 0 + + # Check that machine_id is set correctly + conn = sqlite3.connect(initialized_archive / 'index.sqlite3') + c = conn.cursor() + result = c.execute( + "SELECT machine_id FROM machine_binary WHERE name='python3'" + ).fetchone() + conn.close() + + assert result is not None + machine_id = result[0] + assert machine_id, "machine_id should be set on Binary" diff --git a/archivebox/tests/test_worker_config_propagation.py b/archivebox/tests/test_worker_config_propagation.py index 487cbf15..30c5e4d9 100644 --- a/archivebox/tests/test_worker_config_propagation.py +++ b/archivebox/tests/test_worker_config_propagation.py @@ -246,6 +246,68 @@ print(snapshot.id) "Expected debug output not found in stderr" print("✓ Config debug output found in stderr") + # Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults + verify_precedence_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' + +from archivebox.config.django import setup_django +setup_django() + +from archivebox.core.models import Snapshot +from archivebox.config.configset import get_config + +snapshot = Snapshot.objects.get(id='{snapshot_id}') + +# Test precedence by getting config at different levels +print("\\nTesting config precedence order:") + +# 1. Just defaults (lowest priority) +config_defaults = get_config() +print(f" Defaults only: TIMEOUT={{config_defaults.get('TIMEOUT')}}") + +# 2. With machine config +from archivebox.machine.models import Machine +machine = Machine.current() +config_machine = get_config(machine=machine) +custom_machine = config_machine.get('CUSTOM_MACHINE_KEY') +print(f" + Machine: CUSTOM_MACHINE_KEY={{custom_machine}}") + +# 3. With crawl config +config_crawl = get_config(crawl=snapshot.crawl) +print(f" + Crawl: TIMEOUT={{config_crawl.get('TIMEOUT')}} (should be 777 from crawl.config)") +assert config_crawl.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl.get('TIMEOUT')}}" + +# 4. With snapshot config (highest priority) +config_snapshot = get_config(snapshot=snapshot) +print(f" + Snapshot: TIMEOUT={{config_snapshot.get('TIMEOUT')}} (should be 555 from snapshot.config)") +assert config_snapshot.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config_snapshot.get('TIMEOUT')}}" + +# Verify snapshot config overrides crawl config +assert config_snapshot.get('CUSTOM_CRAWL_KEY') == 'from_crawl_json', "Crawl config should be present" +assert config_snapshot.get('CUSTOM_SNAPSHOT_KEY') == 'from_snapshot_json', "Snapshot config should be present" +assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Machine config should be present" + +print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults") +""" + result = subprocess.run( + ['python', '-c', verify_precedence_script], + cwd=str(data_dir.parent), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=30, + ) + + print(result.stdout.decode()) + if result.returncode != 0: + print("\nPrecedence verification error:") + print(result.stderr.decode()) + assert result.returncode == 0, f"Precedence verification failed: {result.stderr.decode()}" + # Verify config values were actually used by checking ArchiveResults verify_script = f""" import os @@ -475,7 +537,453 @@ print("\\n✓ All config values correctly parsed from environment") print("="*80 + "\n") +def test_parent_environment_preserved_in_hooks(): + """ + Test that parent environment variables are preserved in hook execution. + + This test catches the bug where we built env=os.environ.copy() but then + clobbered it with process.env={}, losing all parent environment. + + Also verifies: + - NODE_PATH is correctly derived from LIB_DIR/npm/node_modules + - LIB_BIN_DIR is correctly derived and added to PATH + """ + + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) / 'test_archive' + data_dir.mkdir() + + print(f"\n{'='*80}") + print(f"Test: Parent Environment Preserved in Hooks") + print(f"DATA_DIR: {data_dir}") + print(f"{'='*80}\n") + + # Initialize archive + print("Step 1: Initialize archive") + result = subprocess.run( + ['python', '-m', 'archivebox', 'init'], + cwd=str(data_dir), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=60, + ) + assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" + print(f"✓ Archive initialized\n") + + # Create snapshot + print("Step 2: Create Snapshot") + create_snapshot_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' + +from archivebox.config.django import setup_django +setup_django() + +from django.utils import timezone +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl + +crawl = Crawl.objects.create( + urls='https://example.com', + status='queued', + retry_at=timezone.now() +) + +snapshot = Snapshot.objects.create( + url='https://example.com', + crawl=crawl, + status='queued', + retry_at=timezone.now() +) +print(snapshot.id) +""" + result = subprocess.run( + ['python', '-c', create_snapshot_script], + cwd=str(data_dir.parent), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=30, + ) + assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}" + snapshot_id = result.stdout.decode().strip().split('\n')[-1] + print(f"✓ Created snapshot {snapshot_id}\n") + + # Run SnapshotWorker with custom parent environment variable + print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process") + result = subprocess.run( + ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id], + cwd=str(data_dir), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + 'TEST_PARENT_ENV_VAR': 'preserved_from_parent', # This should reach the hook + 'PLUGINS': 'favicon', # Use existing plugin (favicon is simple and fast) + }, + capture_output=True, + timeout=120, + ) + + stdout = result.stdout.decode() + stderr = result.stderr.decode() + + print("\n--- SnapshotWorker stderr (first 50 lines) ---") + print('\n'.join(stderr.split('\n')[:50])) + print("--- End stderr ---\n") + + # Verify hooks ran by checking Process records + print("Step 4: Verify environment variables in hook Process records") + verify_env_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' + +from archivebox.config.django import setup_django +setup_django() + +from archivebox.machine.models import Process +from archivebox.core.models import Snapshot +import json + +snapshot = Snapshot.objects.get(id='{snapshot_id}') + +# Find hook processes for this snapshot +hook_processes = Process.objects.filter( + process_type=Process.TypeChoices.HOOK, + pwd__contains=str(snapshot.id) +).order_by('-created_at') + +print(f"Found {{hook_processes.count()}} hook processes") + +if hook_processes.count() == 0: + print("ERROR: No hook processes found!") + import sys + sys.exit(1) + +# Check the first hook process environment +hook_process = hook_processes.first() +print(f"\\nChecking hook: {{hook_process.cmd}}") +print(f"Hook env keys: {{len(hook_process.env)}} total") + +# Verify TEST_PARENT_ENV_VAR was preserved +test_parent = hook_process.env.get('TEST_PARENT_ENV_VAR') +print(f" TEST_PARENT_ENV_VAR: {{test_parent}}") +assert test_parent == 'preserved_from_parent', f"Expected 'preserved_from_parent', got {{test_parent}}" + +# Verify LIB_DIR is set +lib_dir = hook_process.env.get('LIB_DIR') +print(f" LIB_DIR: {{lib_dir}}") +assert lib_dir is not None, "LIB_DIR not set" + +# Verify LIB_BIN_DIR is derived +lib_bin_dir = hook_process.env.get('LIB_BIN_DIR') +print(f" LIB_BIN_DIR: {{lib_bin_dir}}") +if lib_dir: + assert lib_bin_dir is not None, "LIB_BIN_DIR not derived from LIB_DIR" + assert lib_bin_dir.endswith('/bin'), f"LIB_BIN_DIR should end with /bin, got {{lib_bin_dir}}" + +# Verify LIB_BIN_DIR is in PATH +path = hook_process.env.get('PATH') +if lib_bin_dir: + assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..." + +# Verify NODE_PATH is set +node_path = hook_process.env.get('NODE_PATH') +node_modules_dir = hook_process.env.get('NODE_MODULES_DIR') +print(f" NODE_PATH: {{node_path}}") +print(f" NODE_MODULES_DIR: {{node_modules_dir}}") +if node_path: + # Should also have NODE_MODULES_DIR for backwards compatibility + assert node_modules_dir == node_path, f"NODE_MODULES_DIR should match NODE_PATH" + +print("\\n✓ All environment checks passed") +""" + result = subprocess.run( + ['python', '-c', verify_env_script], + cwd=str(data_dir.parent), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=30, + ) + + print(result.stdout.decode()) + if result.returncode != 0: + print("\nVerification error:") + print(result.stderr.decode()) + + assert result.returncode == 0, f"Environment verification failed: {result.stderr.decode()}" + + print("\n" + "="*80) + print("✓ TEST PASSED: Parent environment preserved in hooks") + print(" - Custom parent env vars reach hooks") + print(" - LIB_DIR propagated correctly") + print(" - LIB_BIN_DIR derived and added to PATH") + print(" - NODE_PATH/NODE_MODULES_DIR set when available") + print("="*80 + "\n") + + +def test_config_auto_fetch_relationships(): + """ + Test that get_config() auto-fetches related objects from relationships. + + Verifies: + - snapshot auto-fetched from archiveresult.snapshot + - crawl auto-fetched from snapshot.crawl + - user auto-fetched from crawl.created_by + """ + + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) / 'test_archive' + data_dir.mkdir() + + print(f"\n{'='*80}") + print(f"Test: Config Auto-Fetch Relationships") + print(f"DATA_DIR: {data_dir}") + print(f"{'='*80}\n") + + # Initialize archive + print("Step 1: Initialize archive") + result = subprocess.run( + ['python', '-m', 'archivebox', 'init'], + cwd=str(data_dir), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=60, + ) + assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" + print(f"✓ Archive initialized\n") + + # Create objects with config at each level + print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level") + create_objects_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' + +from archivebox.config.django import setup_django +setup_django() + +from django.utils import timezone +from archivebox.crawls.models import Crawl +from archivebox.core.models import Snapshot, ArchiveResult +from archivebox.config.configset import get_config + +# Create crawl with config +crawl = Crawl.objects.create( + urls='https://example.com', + status='queued', + retry_at=timezone.now(), + config={{ + 'CRAWL_KEY': 'from_crawl', + 'TIMEOUT': 777, + }} +) + +# Create snapshot with config +snapshot = Snapshot.objects.create( + url='https://example.com', + crawl=crawl, + status='queued', + retry_at=timezone.now(), + config={{ + 'SNAPSHOT_KEY': 'from_snapshot', + 'TIMEOUT': 555, + }} +) + +# Create ArchiveResult +ar = ArchiveResult.objects.create( + snapshot=snapshot, + plugin='test', + hook_name='test_hook', + status=ArchiveResult.StatusChoices.STARTED +) + +print(f"Created: crawl={{crawl.id}}, snapshot={{snapshot.id}}, ar={{ar.id}}") + +# Test 1: Auto-fetch crawl from snapshot +print("\\nTest 1: get_config(snapshot=snapshot) auto-fetches crawl") +config = get_config(snapshot=snapshot) +assert config.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config.get('TIMEOUT')}}" +assert config.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot, got {{config.get('SNAPSHOT_KEY')}}" +assert config.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl, got {{config.get('CRAWL_KEY')}}" +print("✓ Snapshot config (TIMEOUT=555) overrides crawl config (TIMEOUT=777)") +print("✓ Both snapshot.config and crawl.config values present") + +# Test 2: Auto-fetch snapshot from archiveresult +print("\\nTest 2: get_config(archiveresult=ar) auto-fetches snapshot and crawl") +config_from_ar = get_config(archiveresult=ar) +assert config_from_ar.get('TIMEOUT') == 555, f"Expected 555, got {{config_from_ar.get('TIMEOUT')}}" +assert config_from_ar.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot" +assert config_from_ar.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl" +print("✓ Auto-fetched snapshot from ar.snapshot") +print("✓ Auto-fetched crawl from snapshot.crawl") + +# Test 3: Precedence without auto-fetch (explicit crawl only) +print("\\nTest 3: get_config(crawl=crawl) without snapshot") +config_crawl_only = get_config(crawl=crawl) +assert config_crawl_only.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl_only.get('TIMEOUT')}}" +assert config_crawl_only.get('CRAWL_KEY') == 'from_crawl' +assert config_crawl_only.get('SNAPSHOT_KEY') is None, "Should not have snapshot config" +print("✓ Crawl-only config has TIMEOUT=777") +print("✓ No snapshot config values present") + +print("\\n✓ All auto-fetch tests passed") +""" + + result = subprocess.run( + ['python', '-c', create_objects_script], + cwd=str(data_dir.parent), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=30, + ) + + print(result.stdout.decode()) + if result.returncode != 0: + print("\nAuto-fetch test error:") + print(result.stderr.decode()) + + assert result.returncode == 0, f"Auto-fetch test failed: {result.stderr.decode()}" + + print("\n" + "="*80) + print("✓ TEST PASSED: Config auto-fetches related objects correctly") + print(" - archiveresult → snapshot → crawl → user") + print(" - Precedence preserved during auto-fetch") + print("="*80 + "\n") + + +def test_config_precedence_with_environment_vars(): + """ + Test that config precedence order is correct when environment vars are set. + + Documented order (highest to lowest): + 1. snapshot.config + 2. crawl.config + 3. user.config + 4. persona config + 5. environment variables <-- LOWER priority than snapshot/crawl + 6. machine.config + 7. config file + 8. plugin defaults + 9. core defaults + + This test verifies snapshot.config overrides environment variables. + """ + + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) / 'test_archive' + data_dir.mkdir() + + print(f"\n{'='*80}") + print(f"Test: Config Precedence with Environment Variables") + print(f"DATA_DIR: {data_dir}") + print(f"{'='*80}\n") + + # Initialize + result = subprocess.run( + ['python', '-m', 'archivebox', 'init'], + cwd=str(data_dir), + env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'}, + capture_output=True, + timeout=60, + ) + assert result.returncode == 0 + print("✓ Archive initialized\n") + + # Test with environment variable set + print("Step 1: Test with TIMEOUT=999 in environment") + test_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' +os.environ['TIMEOUT'] = '999' # Set env var + +from archivebox.config.django import setup_django +setup_django() + +from django.utils import timezone +from archivebox.crawls.models import Crawl +from archivebox.core.models import Snapshot +from archivebox.config.configset import get_config + +# Create crawl with TIMEOUT=777 +crawl = Crawl.objects.create( + urls='https://example.com', + status='queued', + retry_at=timezone.now(), + config={{'TIMEOUT': 777}} +) + +# Create snapshot with TIMEOUT=555 +snapshot = Snapshot.objects.create( + url='https://example.com', + crawl=crawl, + status='queued', + retry_at=timezone.now(), + config={{'TIMEOUT': 555}} +) + +# Get config with all sources +config = get_config(snapshot=snapshot) + +print(f"Environment: TIMEOUT={{os.environ.get('TIMEOUT')}}") +print(f"Crawl config: TIMEOUT={{crawl.config.get('TIMEOUT')}}") +print(f"Snapshot config: TIMEOUT={{snapshot.config.get('TIMEOUT')}}") +print(f"Merged config: TIMEOUT={{config.get('TIMEOUT')}}") + +# Snapshot should override both crawl AND environment +expected = 555 +actual = config.get('TIMEOUT') +if actual != expected: + print(f"\\n❌ PRECEDENCE BUG: Expected {{expected}}, got {{actual}}") + print(f" Snapshot.config should have highest priority!") + import sys + sys.exit(1) + +print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999) and crawl.config (777)") +""" + + result = subprocess.run( + ['python', '-c', test_script], + cwd=str(data_dir.parent), + capture_output=True, + timeout=30, + ) + + print(result.stdout.decode()) + if result.returncode != 0: + print("\nPrecedence bug detected:") + print(result.stderr.decode()) + + assert result.returncode == 0, f"Precedence test failed: {result.stderr.decode()}" + + print("\n" + "="*80) + print("✓ TEST PASSED: Snapshot config correctly overrides environment variables") + print("="*80 + "\n") + + if __name__ == '__main__': # Run as standalone script test_config_propagation_through_worker_hierarchy() test_config_environment_variable_parsing() + test_parent_environment_preserved_in_hooks() + test_config_auto_fetch_relationships() + test_config_precedence_with_environment_vars() diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 1197aa4c..f6d79180 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -36,7 +36,7 @@ from django.utils import timezone from rich import print from archivebox.misc.logging_util import log_worker_event -from .worker import Worker, CrawlWorker +from .worker import Worker, BinaryWorker, CrawlWorker def _run_orchestrator_process(exit_on_idle: bool) -> None: @@ -63,13 +63,14 @@ class Orchestrator: - Each SnapshotWorker runs hooks sequentially for its snapshot """ - # Only CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator - WORKER_TYPES: list[Type[Worker]] = [CrawlWorker] + # BinaryWorker (singleton daemon) and CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator + WORKER_TYPES: list[Type[Worker]] = [BinaryWorker, CrawlWorker] # Configuration POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds) IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit) MAX_CRAWL_WORKERS: int = 8 # Max crawls processing simultaneously + MAX_BINARY_WORKERS: int = 1 # Max binaries installing simultaneously (sequential only) def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None): self.exit_on_idle = exit_on_idle @@ -194,15 +195,23 @@ class Orchestrator: return len(WorkerClass.get_running_workers()) def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool: - """Determine if we should spawn a new CrawlWorker.""" + """Determine if we should spawn a new worker.""" if queue_count == 0: return False - # Check CrawlWorker limit + # Get appropriate limit based on worker type + if WorkerClass.name == 'crawl': + max_workers = self.MAX_CRAWL_WORKERS + elif WorkerClass.name == 'binary': + max_workers = self.MAX_BINARY_WORKERS # Force sequential: only 1 binary at a time + else: + max_workers = 1 # Default for unknown types + + # Check worker limit running_workers = WorkerClass.get_running_workers() running_count = len(running_workers) - if running_count >= self.MAX_CRAWL_WORKERS: + if running_count >= max_workers: return False # Check if we already have enough workers for the queue size @@ -285,14 +294,35 @@ class Orchestrator: def check_queues_and_spawn_workers(self) -> dict[str, int]: """ - Check Crawl queue and spawn CrawlWorkers as needed. + Check Binary and Crawl queues and spawn workers as needed. Returns dict of queue sizes. """ from archivebox.crawls.models import Crawl + from archivebox.machine.models import Binary, Machine queue_sizes = {} - # Only check Crawl queue + # Check Binary queue + machine = Machine.current() + binary_queue = Binary.objects.filter( + machine=machine, + status=Binary.StatusChoices.QUEUED, + retry_at__lte=timezone.now() + ).order_by('retry_at') + binary_count = binary_queue.count() + queue_sizes['binary'] = binary_count + + # Spawn BinaryWorker if needed (one worker per binary, up to MAX_BINARY_WORKERS) + if self.should_spawn_worker(BinaryWorker, binary_count): + # Get next binary to process + binary = binary_queue.first() + if binary: + BinaryWorker.start(binary_id=str(binary.id)) + + # Check if any BinaryWorkers are still running + running_binary_workers = len(BinaryWorker.get_running_workers()) + + # Check Crawl queue crawl_queue = Crawl.objects.filter( retry_at__lte=timezone.now() ).exclude( @@ -307,12 +337,15 @@ class Orchestrator: crawl_count = crawl_queue.count() queue_sizes['crawl'] = crawl_count - # Spawn CrawlWorker if needed - if self.should_spawn_worker(CrawlWorker, crawl_count): - # Claim next crawl - crawl = crawl_queue.first() - if crawl and self._claim_crawl(crawl): - CrawlWorker.start(crawl_id=str(crawl.id)) + # CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running + # This ensures all binaries are installed before snapshots start processing + if binary_count == 0 and running_binary_workers == 0: + # Spawn CrawlWorker if needed + if self.should_spawn_worker(CrawlWorker, crawl_count): + # Claim next crawl + crawl = crawl_queue.first() + if crawl and self._claim_crawl(crawl): + CrawlWorker.start(crawl_id=str(crawl.id)) return queue_sizes @@ -328,7 +361,7 @@ class Orchestrator: ) return updated == 1 - + def has_pending_work(self, queue_sizes: dict[str, int]) -> bool: """Check if any queue has pending work.""" return any(count > 0 for count in queue_sizes.values()) diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 9355649d..85a31224 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -323,6 +323,20 @@ class Worker: pwd = Path(snapshot.output_dir) # Run in snapshot's output directory env = get_config(snapshot=snapshot) + elif cls.name == 'binary': + # BinaryWorker processes a specific binary installation + binary_id = kwargs.get('binary_id') + if not binary_id: + raise ValueError("BinaryWorker requires binary_id") + + from archivebox.machine.models import Binary + binary = Binary.objects.get(id=binary_id) + + cmd = [sys.executable, '-m', 'archivebox', 'run', '--binary-id', str(binary_id)] + pwd = Path(settings.DATA_DIR) / 'machines' / str(Machine.current().id) / 'binaries' / binary.name / str(binary.id) + pwd.mkdir(parents=True, exist_ok=True) + env = get_config() + else: raise ValueError(f"Unknown worker type: {cls.name}") @@ -654,16 +668,8 @@ class SnapshotWorker(Worker): hooks = discover_hooks('Snapshot', config=config) hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix) - import sys - print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True) - if hooks: - print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True) - else: - print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True) - # Execute each hook sequentially for hook_path in hooks: - print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True) hook_name = hook_path.name plugin = self._extract_plugin_name(hook_name) hook_step = extract_step(hook_name) @@ -829,8 +835,96 @@ class SnapshotWorker(Worker): return name +class BinaryWorker(Worker): + """ + Worker that processes a specific Binary installation. + + Like CrawlWorker and SnapshotWorker, BinaryWorker: + - Processes one specific binary (specified by binary_id) + - Installs it via Binary.run() which runs on_Binary__* hooks + - Exits when done + + Orchestrator spawns BinaryWorkers sequentially (MAX_BINARY_WORKERS=1) to avoid + conflicts during binary installations. + """ + + name: ClassVar[str] = 'binary' + MAX_TICK_TIME: ClassVar[int] = 600 # 10 minutes for binary installations + MAX_CONCURRENT_TASKS: ClassVar[int] = 1 # One binary per worker + + def __init__(self, binary_id: str, worker_id: int = 0): + self.binary_id = binary_id + super().__init__(worker_id=worker_id) + + def get_model(self): + from archivebox.machine.models import Binary + return Binary + + def get_next_item(self): + """Get the specific binary to install.""" + from archivebox.machine.models import Binary + + try: + return Binary.objects.get(id=self.binary_id) + except Binary.DoesNotExist: + return None + + def runloop(self) -> None: + """Install the specified binary.""" + import sys + + self.on_startup() + + try: + binary = self.get_next_item() + + if not binary: + log_worker_event( + worker_type='BinaryWorker', + event=f'Binary {self.binary_id} not found', + indent_level=1, + pid=self.pid, + ) + return + + print(f'[cyan]🔧 BinaryWorker installing: {binary.name}[/cyan]', file=sys.stderr) + + # Tick the state machine to trigger installation + # This calls BinaryMachine.on_install() -> Binary.run() -> on_Binary__* hooks + binary.sm.tick() + + # Check result + binary.refresh_from_db() + if binary.status == Binary.StatusChoices.INSTALLED: + log_worker_event( + worker_type='BinaryWorker', + event=f'Installed: {binary.name} -> {binary.abspath}', + indent_level=1, + pid=self.pid, + ) + else: + log_worker_event( + worker_type='BinaryWorker', + event=f'Installation pending: {binary.name} (status={binary.status})', + indent_level=1, + pid=self.pid, + ) + + except Exception as e: + log_worker_event( + worker_type='BinaryWorker', + event=f'Failed to install binary', + indent_level=1, + pid=self.pid, + error=e, + ) + finally: + self.on_shutdown() + + # Populate the registry WORKER_TYPES.update({ + 'binary': BinaryWorker, 'crawl': CrawlWorker, 'snapshot': SnapshotWorker, }) diff --git a/bin/test_plugins.sh b/bin/test_plugins.sh index cc21eca6..7a12bb94 100755 --- a/bin/test_plugins.sh +++ b/bin/test_plugins.sh @@ -239,6 +239,8 @@ for test_dir in $TEST_DIRS; do PYTEST_CMD="python -m pytest $test_dir -p no:django -v --tb=short" if [ "$ENABLE_COVERAGE" = true ]; then PYTEST_CMD="$PYTEST_CMD --cov=$plugin_name --cov-append --cov-branch" + echo "[DEBUG] NODE_V8_COVERAGE before pytest: $NODE_V8_COVERAGE" + python -c "import os; print('[DEBUG BASH->PYTHON] NODE_V8_COVERAGE:', os.environ.get('NODE_V8_COVERAGE', 'NOT_SET'))" fi if eval "$PYTEST_CMD" 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then diff --git a/old/TODO_hook_architecture.md b/old/TODO_hook_architecture.md index 5c8a7c56..00f3b86a 100755 --- a/old/TODO_hook_architecture.md +++ b/old/TODO_hook_architecture.md @@ -1878,7 +1878,7 @@ Updated `archivebox/core/statemachines.py`: |--------|------|--------|-------| | favicon | `on_Snapshot__11_favicon.py` | ✅ UPDATED | Now outputs clean JSONL | | git | `on_Snapshot__12_git.py` | ✅ UPDATED | Now outputs clean JSONL with cmd | -| archive_org | `on_Snapshot__13_archive_org.py` | ✅ UPDATED | Now outputs clean JSONL | +| archivedotorg | `on_Snapshot__13_archivedotorg.py` | ✅ UPDATED | Now outputs clean JSONL | | title | `on_Snapshot__32_title.js` | ✅ UPDATED | Now outputs clean JSONL | | singlefile | `on_Snapshot__37_singlefile.py` | ✅ UPDATED | Now outputs clean JSONL with cmd | | wget | `on_Snapshot__50_wget.py` | ✅ UPDATED | Now outputs clean JSONL with cmd | @@ -1930,7 +1930,7 @@ The following hooks have been renamed with `.bg.` suffix: - `archivebox/core/migrations/0030_migrate_output_field.py` (new) ### Plugins Updated (Python Hooks) -- `archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py` +- `archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py` - `archivebox/plugins/favicon/on_Snapshot__11_favicon.py` - `archivebox/plugins/git/on_Snapshot__12_git.py` - `archivebox/plugins/media/on_Snapshot__51_media.py`