#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox remove'

import shutil
from pathlib import Path
from typing import Iterable

import rich_click as click

from django.db.models import QuerySet

from archivebox.config import DATA_DIR
from archivebox.index.schema import Link
from archivebox.config.django import setup_django
from archivebox.index import load_main_index
from archivebox.index.sql import remove_from_sql_main_index
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.checks import check_data_folder
from archivebox.misc.logging_util import (
    log_list_started,
    log_list_finished,
    log_removal_started,
    log_removal_finished,
    TimedProgress,
)


@enforce_types
def remove(filter_patterns: Iterable[str]=(),
           filter_type: str='exact',
           snapshots: QuerySet | None=None,
           after: float | None=None,
           before: float | None=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: Path=DATA_DIR) -> Iterable[Link]:
    """Remove the specified URLs from the archive"""

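    # bootstrapping: configure the Django ORM and verify we are inside a valid ArchiveBox data folder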
    setup_django()
    check_data_folder()

    from archivebox.cli.archivebox_search import list_links

    list_kwargs = {
        "filter_patterns": filter_patterns,
        "filter_type": filter_type,
        "after": after,
        "before": before,
    }
    if snapshots:
        list_kwargs["snapshots"] = snapshots

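    # resolve the filter patterns and timestamp bounds to a QuerySet of matching snapshots,
    # reusing the list_links helper shared with the search command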
    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix=' ')
    try:
        snapshots = list_links(**list_kwargs)
    finally:
        timer.end()

    if not snapshots.exists():
        log_removal_finished(0, 0)
        raise SystemExit(1)

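    # print the matching snapshots and confirm the removal before proceeding (skipped with --yes)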
    log_links = [link.as_link() for link in snapshots]
    log_list_finished(log_links)
    log_removal_started(log_links, yes=yes, delete=delete)

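    # with --delete, also wipe each snapshot's data directory (link_dir) from disk, not just its index entry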
    timer = TimedProgress(360, prefix=' ')
    try:
        for snapshot in snapshots:
            if delete:
                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
    finally:
        timer.end()

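    # drop the matching snapshots from the search index and the SQL main index,
    # then reload the remaining index to report what is left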
    to_remove = snapshots.count()

    from archivebox.search import flush_search_index

    flush_search_index(snapshots=snapshots)
    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
    all_snapshots = load_main_index(out_dir=out_dir)
    log_removal_finished(all_snapshots.count(), to_remove)

    return all_snapshots


@click.command()
@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.argument('filter_patterns', nargs=-1)
@docstring(remove.__doc__)
def main(**kwargs):
    """Remove the specified URLs from the archive"""
    remove(**kwargs)


if __name__ == '__main__':
    main()
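

# Usage sketch (assumes an initialized ArchiveBox data folder as the working directory;
# the URLs and flag combinations below are illustrative examples, not taken from this repo):
#
#   archivebox remove 'https://example.com/page'                   # remove one exact URL from the index
#   archivebox remove --filter-type=domain example.com             # remove every snapshot for a domain
#   archivebox remove --delete --yes 'https://example.com/page'    # also delete archived files, skip the prompt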