ArchiveBox/archivebox/extractors/extractor.py

import hashlib
import mimetypes
import os
import subprocess
from typing import ClassVar
from datetime import timedelta
from pathlib import Path
from django.utils import timezone
from archivebox.misc.hashing import get_dir_info
from core.models import ArchiveResult
from machine.models import Machine, NetworkInterface
import abx
import archivebox
# class Extractor:
# # static class variables
# name: ClassVar[str] = 'ytdlp'
# verbose_name: ClassVar[str] = 'YT-DLP'
# binaries: ClassVar[tuple[str, ...]] = ()
# daemons: ClassVar[tuple[str, ...]] = ()
# timeout: ClassVar[int] = 60
#
# # instance variables
# ARCHIVERESULT: ArchiveResult
# CONFIG: dict[str, object]
# BINARIES: dict[str, object]
# DAEMONS: dict[str, object]
#
# def __init__(self, archiveresult: ArchiveResult, extra_config: dict | None=None):
# assert archiveresult.pk, 'ArchiveResult must be saved to DB before it can be extracted'
# self.archiveresult = self.ARCHIVERESULT = archiveresult
# self.CONFIG = archivebox.pm.hook.get_SCOPE_CONFIG(archiveresult=self.archiveresult, extra=extra_config)
# all_binaries = abx.as_dict(archivebox.pm.hook.get_BINARIES())
# all_daemons = abx.as_dict(archivebox.pm.hook.get_DAEMONS())
# self.BINARIES = {
# binary_name: all_binaries[binary_name]
# for binary_name in self.binaries
# }
# self.DAEMONS = {
# daemon_name: all_daemons[daemon_name]
# for daemon_name in self.daemons
# }
# def extract(self, config: dict | None=None) -> 'ArchiveResult':
#         """
#         Run this extractor end-to-end against self.ARCHIVERESULT. This involves:
#         - making sure any binaries the extractor depends on are installed and loaded
# - creating a new temporary working directory under the snapshot dir to hold extractor output
# - setting up a timer signal to kill the extractor if it runs too long
# - passing the extractor the URLs, temporary working directory, and config dict of options
# - running the extractor in a shell subprocess and collecting stdout/stderr
# - capturing the extractor's exit code
# - if extractor exits with 29 (RetryError), it should set the status to 'BACKOFF' and set retry_at to a datetime in the future
# - if extractor exits with 50 (NotApplicable), it should set the status to 'SKIPPED', and set retry_at to None
# - setting the correct permissions and ownership on all the output files
# - generating the merkle tree of all the output files and their hashes
# - generating a thumbnail of the main output (or collecting one provided by the extractor)
#         - detecting any special output files that need to be parsed by other parts of the system (content types?)
# - metadata.json -> ArchiveResult.output_json
# - outlinks.jsonl -> ArchiveResult.output_links
# - search_texts.txt -> ArchiveResult.index_texts
# - .merkle.json -> ArchiveResult.output_files
# - videos.jsonl -> ArchiveResult.output_videos
# - audios.jsonl -> ArchiveResult.output_audios
# - images.jsonl -> ArchiveResult.output_images
# - htmls.jsonl -> ArchiveResult.output_htmls
# - saving all the result metadata to the ArchiveResult in the database
# """
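#         # A sketch of the special-output mapping listed above (filenames come from
#         # the docstring; the ArchiveResult field names are assumptions, not final):
#         #   SPECIAL_OUTPUT_FIELDS = {
#         #       'metadata.json': 'output_json',
#         #       'outlinks.jsonl': 'output_links',
#         #       'search_texts.txt': 'index_texts',
#         #       '.merkle.json': 'output_files',
#         #       'videos.jsonl': 'output_videos',
#         #       'audios.jsonl': 'output_audios',
#         #       'images.jsonl': 'output_images',
#         #       'htmls.jsonl': 'output_htmls',
#         #   }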
#         archiveresult = self.ARCHIVERESULT
#         config = {**self.CONFIG, **(config or {})}
#         binaries = self.load_binaries()
#         daemons = self.load_daemons()
#         self.before_extract()
#         error = Exception('Failed to start extractor')
#         stdout = ''
#         stderr = ''
#         try:
#             proc = archiveresult.EXTRACTOR.spawn(url=archiveresult.snapshot.url, binaries=binaries, daemons=daemons, cwd=self.OUTPUT_DIR, config=config)
# stdout, stderr = proc.communicate()
# error = None
# except Exception as err:
# error = err
# finally:
# self.after_extract(error=error)
# return archiveresult
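#
#     # Example call site (hypothetical; assumes a saved ArchiveResult row):
#     #   result = Extractor(archiveresult).extract()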
#     def should_extract(self):
#         # placeholder check: this example extractor only applies to YouTube URLs
#         return self.archiveresult.snapshot.url.startswith('https://youtube.com/')
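#
#     # NOTE: a plain startswith() check misses www./m. subdomains and youtu.be links.
#     # A hostname-based variant (sketch; the accepted domain list is an assumption):
#     #   from urllib.parse import urlparse
#     #   hostname = urlparse(self.archiveresult.snapshot.url).hostname or ''
#     #   return hostname.removeprefix('www.') in ('youtube.com', 'youtu.be')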
# def load_binaries(self):
# return {
# bin_name: binary.load()
# for bin_name, binary in self.BINARIES.items()
# }
# def load_daemons(self):
# return {
# daemon_name: daemon.load()
# for daemon_name, daemon in self.DAEMONS.items()
# }
#     def output_dir_name(self):
#         # e.g. 'ytdlp'
#         return self.name
# @property
# def OUTPUT_DIR(self):
# return self.archiveresult.snapshot_dir / self.output_dir_name()
# def before_extract(self):
# # create self.archiveresult.snapshot_dir / self.archiveresult.extractor / dir
# # chown, chmod, etc.
# binaries = self.load_binaries()
# daemons = self.load_daemons()
# cmd = self.archiveresult.EXTRACTOR.get_cmd(binaries=binaries, daemons=daemons)
# cmd_version = self.archiveresult.EXTRACTOR.get_cmd_version(binaries=binaries, daemons=daemons)
# self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# os.chmod(self.OUTPUT_DIR, 0o755)
# self.archiveresult.status = self.archiveresult.StatusChoices.STARTED
# self.archiveresult.retry_at = timezone.now() + timedelta(seconds=self.timeout)
# self.archiveresult.start_ts = timezone.now()
# self.archiveresult.end_ts = None
# self.archiveresult.output = None
# self.archiveresult.output_path = str(self.OUTPUT_DIR.relative_to(self.archiveresult.snapshot_dir))
# self.archiveresult.cmd = cmd
# self.archiveresult.cmd_version = cmd_version
# self.archiveresult.machine = Machine.objects.get_current()
# self.archiveresult.iface = NetworkInterface.objects.get_current()
# self.archiveresult.save()
# self.archiveresult.write_indexes()
#     def run_cmd(self, url: str, binaries: dict, daemons: dict, cwd: Path, config: dict):
#         # (renamed from extract() so it doesn't shadow the orchestration method above)
#         env = {**os.environ, **binaries}  # os.environ.update() returns None; merge into a copy instead
#         proc = subprocess.run(self.archiveresult.cmd, cwd=cwd, env=env, timeout=self.timeout, shell=True, capture_output=True, text=True)
#         self.archiveresult.stdout = proc.stdout
#         self.archiveresult.stderr = proc.stderr
#         self.archiveresult.returncode = proc.returncode
#         self.archiveresult.save()
#         self.archiveresult.write_indexes()
#     def determine_status(self):
#         if self.archiveresult.returncode == 0:
#             return self.archiveresult.StatusChoices.SUCCEEDED, None
#         elif self.archiveresult.returncode == 29:   # RetryError
#             return self.archiveresult.StatusChoices.BACKOFF, timezone.now() + timedelta(seconds=self.timeout)
#         elif self.archiveresult.returncode == 50:   # NotApplicable
#             return self.archiveresult.StatusChoices.SKIPPED, None
#         else:
#             return self.archiveresult.StatusChoices.FAILED, None
#     def collect_outputs(self, cwd: Path):
#         for file in cwd.rglob('*'):
#             if not file.is_file():
#                 continue   # skip directories; only files get hashed + recorded
#             path = file.relative_to(cwd)
#             os.chmod(file, 0o644)
#             #os.chown(file, ARCHIVEBOX_UID, ARCHIVEBOX_GID)
#             output_file = {
#                 'type': 'FILE',
#                 'path': str(path),
#                 'size': file.stat().st_size,
#                 'ext': file.suffix,
#                 'mimetype': mimetypes.guess_type(file)[0],
#                 'sha256': hashlib.sha256(file.read_bytes()).hexdigest(),
#                 'blake3': blake3(file.read_bytes()).hexdigest(),   # needs `from blake3 import blake3` (third-party package; hashlib has no blake3)
#                 'created_at': file.stat().st_ctime,
#                 'modified_at': file.stat().st_mtime,
#                 'symlinks': [
#                     # placeholder alias names, not real outputs:
#                     'screenshot.png',
#                     'example.com',
#                 ],
#             }
#             self.archiveresult.outputs.append(output_file)
#             for outlink in parse_outlinks(file) or []:
#                 self.archiveresult.outputs.append({
#                     'type': 'OUTLINK',
#                     'url': outlink.target,
#                     'selector': outlink.selector,
#                     'text': outlink.text,
#                 })
#
#             if path.name == 'favicon.ico':
#                 self.archiveresult.outputs.append({
#                     'type': 'FAVICON',
#                     'symlinks': {
#                         'favicon': output_file['path'],
#                         'favicon.ico': output_file['path'],
#                         'favicon.png': str(path.with_suffix('.png')),
#                     },
#                     'path': output_file['path'],
#                 })
#             if path.suffix == '.pdf':
#                 self.archiveresult.outputs.append({
#                     'type': 'PDF',
#                     'path': output_file['path'],
#                 })
#
#             if output_file['mimetype'] == 'text/plain':
#                 self.archiveresult.outputs.append({
#                     'type': 'SEARCHTEXT',
#                     'path': output_file['path'],
#                     'archiveresult_id': self.archiveresult.id,
#                 })
#
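#     # parse_outlinks() is referenced above but not defined anywhere yet;
#     # a hypothetical stub for illustration (a real version would pull hrefs,
#     # selectors, and link text out of HTML/JSON outputs):
#     # def parse_outlinks(file: Path) -> list:
#     #     if file.suffix not in ('.html', '.htm'):
#     #         return []
#     #     ...  # extract <a href> targets + selectors + link text here
#     #     return []
#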
#     def after_extract(self, error: Exception | None=None):
#         status, retry_at = self.determine_status()
#
#         self.archiveresult.error = f'{type(error).__name__}: {error}' if error else None
#         self.archiveresult.status = self.archiveresult.StatusChoices.FAILED if error else status
#         self.archiveresult.retry_at = None if error else retry_at
#         self.archiveresult.end_ts = timezone.now()
#         self.archiveresult.output = self.archiveresult.outputs[0]['path'] if self.archiveresult.outputs else None
#         self.archiveresult.save()
#         self.archiveresult.write_indexes()
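

# A minimal sketch (not wired in yet) of the merkle-tree step that extract()'s
# docstring describes, using the get_dir_info() helper imported above. The shape
# of get_dir_info()'s return value and the .merkle.json layout are assumptions.
#
# import json
#
# def write_merkle_index(output_dir: Path) -> dict:
#     """Hash every file under output_dir and persist the tree as .merkle.json."""
#     dir_info = get_dir_info(output_dir)
#     (output_dir / '.merkle.json').write_text(json.dumps(dir_info, indent=4, default=str))
#     return dir_info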