reddit-communities/crawler.py

# -*- coding: utf-8 -*-
"""Crawl reddit from a starting subreddit: read each subreddit's sidebar
and spider out to the /r/... links mentioned there."""
import re
import argparse
from collections import deque

from requests import get
# from dbmodel import DBModel  # local persistence layer; the db calls below are commented out

# Reddit's API guidelines ask clients to identify themselves with a descriptive User-Agent.
headers = {'user-agent': '/u/benediktkr https://github.com/benediktkr/reddit-communities'}
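
# Example invocation (subreddit name is illustrative):
#   python crawler.py /r/programming --depth 2 --fname programming.txt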


def fix(s):
    """Normalize a subreddit reference to the canonical form '/r/name/'."""
    if s.startswith("r/"):
        s = "/" + s
    elif not s.startswith("/r/"):
        s = "/r/" + s  # accept a bare name such as "programming"
    if not s.endswith("/"):
        s += "/"
    return s.lower()
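
# e.g. fix("r/Python") == "/r/python/" and fix("/r/GameDeals") == "/r/gamedeals/"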


def parse_sidebar(subreddit):
    """Fetch /r/name/about.json and pull the /r/... links out of the sidebar text."""
    subreddit = fix(subreddit)
    url = 'https://www.reddit.com/{0}/about.json'.format(subreddit[1:-1])
    try:
        data = get(url, headers=headers).json()
        error = False
    except ValueError as ve:
        # The response body was not valid JSON; keep a copy for debugging.
        print(subreddit + ": " + str(ve))
        with open('errors.txt', 'a') as f:
            f.write(get(url, headers=headers).text)
        error = True
    if error or "error" in data:
        return {'name': subreddit,
                'links': [],
                'subscribers': 0,
                'created': None,
                'error': "dunno" if error else data['error']}
    if data['data']['description']:
        # Every /r/... mention in the sidebar markdown becomes an outgoing edge.
        found = re.findall(r'/r/\w+/?', data['data']['description'])
        links = list({fix(a) for a in found if fix(a) != subreddit})
    else:
        links = []
    subscribers = int(data['data']['subscribers'])
    created = int(data['data']['created'])
    return {'name': subreddit,
            'links': links,
            'subscribers': subscribers,
            'created': created}
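
# parse_sidebar returns a dict shaped like (values illustrative):
#   {'name': '/r/python/', 'links': ['/r/learnpython/', ...],
#    'subscribers': 12345, 'created': 1201234567}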


def bfs(root, fname, max_depth):
    """Breadth-first crawl from root, writing one 'here -> there' edge per line to fname."""
    current_depth = 0
    root = fix(root)  # normalize so the root matches the links parse_sidebar returns
    if not fname:
        fname = fix(root)[3:-1] + ".txt"  # '/r/name/' -> 'name.txt'
    f = open(fname, 'w')
    # db = DBModel.get()
    todo = deque()
    visited = set()
    todo.append(root)
    # None is a sentinel marking the end of a BFS level.
    todo.append(None)
    while len(todo) > 0:
        here = todo.popleft()
        if here is None:
            # End of a level: stop if the queue drained or we hit max_depth.
            if not todo or (max_depth and current_depth >= max_depth):
                print("Exiting at depth", current_depth)
                break
            current_depth += 1
            todo.append(None)
            print(current_depth)
            continue
        if here in visited:
            continue
        visited.add(here)
        this_subreddit = parse_sidebar(here)
        # db.save_subreddit(this_subreddit)
        for subreddit in this_subreddit['links']:
            if subreddit not in visited:
                todo.append(subreddit)
            link = (here, subreddit)
            slink = "{} -> {}".format(here, subreddit)
            f.write(slink + "\n")
            print(slink)
            # print(link)
            # db.save_link(link)
    # db.close()
    f.close()
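
# The output file is a plain edge list, one crawled edge per line, e.g.:
#   /r/python/ -> /r/learnpython/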


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="reddit-communities")
    parser.add_argument('root', metavar='root', type=str,
                        help="Subreddit to use as root for the crawler")
    parser.add_argument('--depth', type=int, default=None, dest="max_depth",
                        help="Maximum depth for the BFS")
    parser.add_argument('--fname', type=str, default=None, dest="fname",
                        help="File to write the edge list to")
    args = parser.parse_args()

    import pprint
    pprint.pprint(parse_sidebar(args.root))
    bfs(**vars(args))