ArchiveBox/archivebox/parsers/pocket_api.py

118 lines
3.1 KiB
Python

__package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable, Optional
from configparser import ConfigParser
import archivebox
from archivebox.config import CONSTANTS
from archivebox.misc.util import enforce_types
from archivebox.misc.system import atomic_write
from ..index.schema import Link
COUNT_PER_PAGE = 500
API_DB_PATH = CONSTANTS.SOURCES_DIR / 'pocket_api.db'
# search for broken protocols that sometimes come from the Pocket API
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
def get_pocket_articles(api, since=None, page=0):
body, headers = api.get(
state='archive',
sort='oldest',
since=since,
count=COUNT_PER_PAGE,
offset=page * COUNT_PER_PAGE,
)
articles = body['list'].values() if isinstance(body['list'], dict) else body['list']
returned_count = len(articles)
yield from articles
if returned_count == COUNT_PER_PAGE:
yield from get_pocket_articles(api, since=since, page=page + 1)
else:
api.last_since = body['since']
def link_from_article(article: dict, sources: list):
url: str = article.get('resolved_url') or article['given_url']
broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
if broken_protocol:
url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
title = article.get('resolved_title') or article.get('given_title') or url
return Link(
url=url,
timestamp=article['time_read'],
title=title,
tags=article.get('tags'),
sources=sources
)
def write_since(username: str, since: str):
if not API_DB_PATH.exists():
atomic_write(API_DB_PATH, '')
since_file = ConfigParser()
since_file.optionxform = str
since_file.read(API_DB_PATH)
since_file[username] = {
'since': since
}
with open(API_DB_PATH, 'w+') as new:
since_file.write(new)
def read_since(username: str) -> Optional[str]:
if not API_DB_PATH.exists():
atomic_write(API_DB_PATH, '')
config_file = ConfigParser()
config_file.optionxform = str
config_file.read(API_DB_PATH)
return config_file.get(username, 'since', fallback=None)
@enforce_types
def should_parse_as_pocket_api(text: str) -> bool:
return text.startswith('pocket://')
@enforce_types
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse bookmarks from the Pocket API"""
from pocket import Pocket
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
input_buffer.seek(0)
pattern = re.compile(r"^pocket:\/\/(\w+)")
for line in input_buffer:
if should_parse_as_pocket_api(line):
username = pattern.search(line).group(1)
api = Pocket(FLAT_CONFIG.POCKET_CONSUMER_KEY, FLAT_CONFIG.POCKET_ACCESS_TOKENS[username])
api.last_since = None
for article in get_pocket_articles(api, since=read_since(username)):
yield link_from_article(article, sources=[line])
write_since(username, api.last_since)
KEY = 'pocket_api'
NAME = 'Pocket API'
PARSER = parse_pocket_api_export