reddit-communities/igraph_analyze.py

55 lines
1.7 KiB
Python

import sqlite3
import igraph
"""Work with the graph from DB."""
def reddit():
    """Return a directed igraph graph of Reddit built from data/reddit.db.

    The first column (``name``) of every row in the ``subreddits`` table
    becomes a vertex.  Every row of the ``mapping`` table becomes a directed
    edge, but only when both of its endpoints are known subreddit names;
    rows that mention an unknown name are dropped.  Duplicate names and
    duplicate edges are collapsed because both are gathered into sets.

    The database connection is closed before returning, including when a
    query or the graph construction raises.
    """
    conn = sqlite3.connect('data/reddit.db')
    try:
        cursor = conn.cursor()
        # TODO: Use the dbmodel.
        vertices = {
            row[0]
            for row in cursor.execute("select name, subscribers from subreddits")
        }
        # NOTE(review): unpacking assumes `mapping` has exactly two columns
        # (source, target) -- confirm against the schema.
        edges = {
            (source, target)
            for source, target in cursor.execute("select * from mapping")
            if source in vertices and target in vertices
        }
    finally:
        # Previously the connection was left open for the garbage collector.
        conn.close()

    g = igraph.Graph(directed=True)
    # Vertices are added by name so that edges can refer to them by name.
    g.add_vertices(list(vertices))
    g.add_edges(list(edges))
    return g
def get_shortest_paths(graph, node, to):
    """Return all shortest paths from `node` to `to` as lists of vertex names.

    `graph` must provide ``get_all_shortest_paths(node, to=...)`` yielding
    paths as sequences of vertex indices, and an indexable ``vs`` whose
    items expose a ``'name'`` attribute.

    Returns a list with one entry per shortest path; each entry is the list
    of ``'name'`` values of the vertices along that path, in path order.
    """
    # Index the vertex sequence (graph.vs[i]) rather than calling it
    # (graph.vs(i)): calling selects a sub-sequence, which made every path
    # element a one-item list instead of the name itself, and disagreed with
    # print_shortest_paths.
    return [[graph.vs[index]['name'] for index in path]
            for path in graph.get_all_shortest_paths(node, to=to)]
def print_shortest_paths(graph, node, to):
    """Print every shortest path from `node` to `to`, one path per line.

    Each line holds the ``'name'`` of the vertices along one path, in path
    order, separated by single spaces.  Nothing is returned.
    """
    for path in graph.get_all_shortest_paths(node, to=to):
        # A distinct loop name avoids shadowing the `node` parameter.
        names = ["%s" % (graph.vs[index]['name'],) for index in path]
        # One single-argument print(...) produces the same text as the old
        # `print name,` / bare `print` pair and parses on Python 2 and 3.
        print(" ".join(names))
def top(n, g):
    """Return the N subreddits with the highest in-degree in graph `g`.

    The result is a list of ``(name, in_degree)`` tuples ordered from the
    largest in-degree to the smallest; vertices with equal in-degree keep
    the order in which they appear in ``g.vs``.  In some sense these are
    the most popular subreddits.

    Every vertex of the graph is considered.  Filtering the graph by
    number of subscribers beforehand gives different (and mildly
    interesting) rankings.
    """
    ranked = []
    for vertex in g.vs:
        in_degree = g.degree(vertex, type="in")
        ranked.append((vertex['name'], in_degree))
    # The sort is stable, so ties stay in vertex order even when reversed.
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked[0:n]
if __name__ == "__main__":
    # Script entry point: build the graph and list the 100 subreddits with
    # the highest in-degree, one per line, as "<index> : <name>" (0-based).
    g = reddit()
    for i, subreddit in enumerate(top(100, g)):
        # Single-argument print(...) emits the same text as the old
        # `print i, ":", name` statement and parses on Python 2 and 3.
        print("%s : %s" % (i, subreddit[0]))
#glayout = g.layout("fr")
# Top 10
# >>> sorted([(i['name'], g.degree(i, type="in")) for i in g.vs], key=lambda a: a[1], reverse=True)[0:10]