ArchiveBox_ArchiveBox/archivebox/cli/archivebox_install.py

#!/usr/bin/env python3

# Explicitly declare which package this module belongs to.
# NOTE(review): presumably this keeps package-relative imports working when the
# file is executed directly as a script (see the __main__ guard) - confirm.
__package__ = 'archivebox.cli'

import os
import sys
import shutil

import rich_click as click
from rich import print

from archivebox.misc.util import docstring, enforce_types


def _parse_binproviders(binproviders: str) -> list[str]:
    """Split a comma-separated provider string into clean provider names.

    Whitespace around each name is stripped and empty entries are dropped, so
    'brew, apt' and 'brew,,apt' both yield ['brew', 'apt'].  The bare wildcard
    '*' (meaning "no provider restriction") yields an empty list.
    """
    names = [name.strip() for name in binproviders.split(',')]
    names = [name for name in names if name]
    if names == ['*']:
        return []
    return names


@enforce_types
def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None:
    """Detect and install ArchiveBox dependencies by running a dependency-check crawl

    Examples:
        archivebox install                              # Install all dependencies
        archivebox install wget curl                    # Install only wget and curl
        archivebox install --binproviders=pip yt-dlp    # Install yt-dlp using only pip
        archivebox install --binproviders=brew,apt      # Install all deps using only brew or apt
    """
    # NOTE: the docstring above is passed to the `docstring` decorator on main(),
    # so treat it as user-facing text and keep developer notes in comments.
    #
    # binaries:     names of specific binaries to install; empty means "all".
    # binproviders: comma-separated provider names, or '*' for no restriction.
    # dry_run:      print what would be done and return without touching the
    #               data directory or the database.

    # Project imports are deferred to call time, matching the rest of this function.
    from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
    from archivebox.config.paths import ARCHIVE_DIR
    from archivebox.misc.logging import stderr
    from archivebox.cli.archivebox_init import init

    provider_names = _parse_binproviders(binproviders)

    # A dry run must not have side effects, so only initialize for a real run.
    archive_dir_ready = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
    if not archive_dir_ready and not dry_run:
        init()  # must init full index because we need a db to store Binary entries in

    # Show what we're installing
    if binaries:
        print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]')
    else:
        print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]')

    if provider_names:
        print(f'[green][+] Using providers: {", ".join(provider_names)}[/green]')

    if IS_ROOT:
        EUID = os.geteuid()
        print()
        print(f'[yellow]:warning:  Running as UID=[blue]{EUID}[/blue].[/yellow]')
        print(f'    DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
        print()

    if dry_run:
        print('[dim]Dry run - would create a crawl to detect dependencies[/dim]')
        return

    # Set up Django (must happen before any model import below)
    from archivebox.config.django import setup_django
    setup_django()

    from django.utils import timezone
    from archivebox.crawls.models import Crawl
    from archivebox.base_models.models import get_or_create_system_user_pk

    # Create a crawl for dependency detection
    # Using a minimal crawl that will trigger on_Crawl hooks
    created_by_id = get_or_create_system_user_pk()

    # Requested binary names and provider names share the existing PLUGINS
    # filter; an empty config means "no filter".
    plugins = [*binaries, *provider_names]
    crawl_config = {'PLUGINS': ','.join(plugins)} if plugins else {}

    crawl, created = Crawl.objects.get_or_create(
        urls='archivebox://install',
        defaults={
            'label': 'Dependency detection',
            'created_by_id': created_by_id,
            'max_depth': 0,
            'status': 'queued',
            'config': crawl_config,
        }
    )

    # If crawl already existed, reset it to queued state so it can be processed
    # again, and overwrite its config with the filter from this invocation.
    if not created:
        crawl.status = 'queued'
        crawl.retry_at = timezone.now()
        crawl.config = crawl_config
        crawl.save()

    # Report accurately whether the crawl is new or a re-queued existing one.
    action = 'Created' if created else 'Re-queued existing'
    print(f'[+] {action} dependency detection crawl: {crawl.id}')
    if crawl_config:
        print(f'[+] Crawl config: {crawl_config}')
    print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')

    # Verify the crawl is in the queue. The queryset is evaluated once into a
    # list so the count and the listing come from the same single query.
    queued_crawls = list(
        Crawl.objects.filter(
            retry_at__lte=timezone.now()
        ).exclude(
            status__in=Crawl.FINAL_STATES
        )
    )
    print(f'[+] Crawls in queue: {len(queued_crawls)}')
    for queued in queued_crawls:
        print(f'    - Crawl {queued.id}: status={queued.status}, retry_at={queued.retry_at}')

    print('[+] Running crawl to detect binaries via on_Crawl hooks...')
    print()

    # Run the crawl synchronously (this triggers on_Crawl hooks)
    from archivebox.workers.orchestrator import Orchestrator
    orchestrator = Orchestrator(exit_on_idle=True)
    orchestrator.runloop()

    print()

    # Remind the user to create an admin account if the only superuser (if any)
    # is the internal 'system' user.
    from django.contrib.auth import get_user_model
    User = get_user_model()

    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
        stderr('    archivebox manage createsuperuser')

    print()

    # Show version to display full status including installed binaries
    # Django is already loaded, so just import and call the function directly
    from archivebox.cli.archivebox_version import version as show_version
    show_version(quiet=False)


# CLI entry point for `archivebox install`.
# click collects the positional BINARIES plus the two options below into
# keyword arguments (binaries, binproviders, dry_run) and hands them to main().
@click.command()
@click.argument('binaries', nargs=-1, type=str, required=False)
@click.option(
    '--binproviders', '-p',
    default='*',
    show_default=True,
    help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all',
)
@click.option(
    '--dry-run', '-d',
    is_flag=True,
    default=False,
    help='Show what would happen without actually running',
)
@docstring(install.__doc__)
def main(**kwargs) -> None:
    # Forward the parsed CLI parameters unchanged; install() does all the work.
    install(**kwargs)


# Allow running this module directly as a script: invoke the click command,
# which reads its arguments from the command line.
if __name__ == '__main__':
    main()