Visualizing Github Network Using Networkx

Let’s install a bunch of stuff. You’ll need a virtual environment activated, or be running as root, otherwise your system will yell at you.

In [ ]:
!pip install requests networkx matplotlib scipy numpy


Cool, now let’s import a bunch of stuff, and tell matplotlib that we want to see the graphs, instead of saving them to a file or something.

In [ ]:
import os
import json

import requests
import networkx as nx
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


Let’s try and find the community following my Github. We’ll ping the Github GraphQL using an OAuth Token that you can get by following these directions, and signing the prerelease agreement. I used Github’s GraphQL API Explorer to help format the queries. We’ll get our followers, and everyone that they are following. You can see the queries here.

In [ ]:
# Endpoint for Github's GraphQL (v4) API — all queries are POSTed here
github_graphql_url = 'https://api.github.com/graphql'

# I'm using environment variables so I can distribute this without leaking my token
# a normal string will do
oauth_token = os.getenv('GITHUB_OAUTH_TOKEN')

# format strings in python 3.6+ are pretty neat

In [ ]:
#ignore
# Template for the main query: who follows the viewer, and who is each of
# those followers following?  Doubled braces escape GraphQL's own braces so
# str.format() only substitutes {follower_args}.  The `login` fields are
# required: the parsing code reads node['login'] to build graph edges.
query = '''
query {{
  viewer {{
    login
    followers ({follower_args})
    {{
      pageInfo {{
        hasNextPage
        endCursor
      }}
      nodes {{
        login
        following (first: 100) {{
          pageInfo {{
            hasNextPage
            endCursor
          }}
          edges {{
            node {{
              login
            }}
          }}
        }}
      }}
    }}
  }}
}}
'''

# Template for paging through a single user's `following` list.  The parsing
# code reads data['repositoryOwner']['following'], so the query must go
# through repositoryOwner(login: ...).
user_query = '''
query {{
  repositoryOwner(login: "{login}") {{
    ... on User {{
      following(first: 100 after: "{cursor}") {{
        totalCount
        pageInfo {{
          endCursor
          hasNextPage
        }}
        edges {{
          node {{
            login
          }}
        }}
      }}
    }}
  }}
}}
'''

# Argument snippets for the followers(...) field: first page vs. cursor page
initial_follower_args = 'first: 100'
cursor_follower_args = "first: 100 after: \"{cursor}\""


I don’t want to remember how to get the data out while I’m handling the parsing of the data (which will be the majority of the code), so let’s define helper methods to get the data.

In [ ]:
def get_data(query):
    """POST a GraphQL query string to Github and return its 'data' payload.

    query: a fully formatted GraphQL query string.
    Returns the dict found under the top-level 'data' key of the JSON reply.
    """
    # Use the json dumps method to translate the query into a JSON body
    data = json.dumps({'query': query})
    # Github's GraphQL API authenticates via a bearer token header
    headers = {'Authorization': 'bearer {}'.format(oauth_token)}
    # We'll use requests, passing in the URL, query data, and header info
    r = requests.post(github_graphql_url,
                      data=data,
                      headers=headers)

    # Return the json data, stripping out the first key
    return r.json()['data']

# Define a helper method that deals with the cursor logic
def get_follower_and_login_data(cursor=None):
    """Fetch one page of the viewer's followers, plus the viewer's login.

    cursor: pagination cursor returned by a previous page, or None for
            the first page of followers.
    Returns (follower_data, login) where follower_data is the
    viewer['followers'] dict from the GraphQL reply.
    """
    # If there's a cursor, put it into our query
    if cursor:
        args = cursor_follower_args.format(cursor=cursor)
    # Else, we'll just tell Github we want the first 100 followers
    else:
        args = initial_follower_args

    # We need to put the args into our query
    data_query = query.format(follower_args=args)

    # Get the data
    data = get_data(data_query)

    # Strip out the follower data and our login name and return them
    follower_data = data['viewer']['followers']
    return follower_data, data['viewer']['login']


We’ll also define a few methods to help us deal with the deeply nested JSON reply that we’re going to get back.

In [ ]:
def get_followers_and_next_page(my_follower_data: dict) -> (list, bool, str):
    """Unpack one page of follower data from the GraphQL reply.

    my_follower_data: the viewer['followers'] dict from a reply.
    Returns (followers, has_next_page, cursor) — the annotation previously
    claimed only two values, but three are returned.
    """
    # who is following me?
    my_followers = my_follower_data['nodes']
    # do I have more than one page of followers?
    has_next_page = my_follower_data['pageInfo']['hasNextPage']
    # what is the cursor string to use in the next query?
    cursor = my_follower_data['pageInfo']['endCursor']

    return my_followers, has_next_page, cursor

def get_first_layer_followers_and_next_page(a_follower: dict) -> (list,
                                                                  bool,
                                                                  str):
    """Unpack who a single follower is following.

    a_follower: one node from the followers['nodes'] list.
    Returns (following_logins, has_next_page, cursor).
    """
    # Who is this user following?
    following = a_follower['following']

    # Is there a next page?
    has_next_page = following['pageInfo']['hasNextPage']
    # what is the cursor string to use in the next query?
    # Bug fix: this previously read an undefined name `my_follower_data`,
    # which raised a NameError; the cursor lives on *this* user's pageInfo.
    cursor = following['pageInfo']['endCursor']
    following = following['edges']
    following = [x['node']['login'] for x in following]

    return following, has_next_page, cursor


We’ll be using networkx to help us do the network graph.

In [ ]:
# setup the graph: an undirected networkx graph where nodes are Github
# logins and each edge records a "follows" relationship
graph = nx.Graph()


Now let’s get/parse the data and add it to the graph.

In [ ]:
# Grab the first page of my followers, plus my own login name
my_follower_data, user_login_name = get_follower_and_login_data()

# Initial variable setup
has_next_page = True
on_last_page = False
# login -> cursor, for followers whose "following" list spans multiple pages
followers_with_many_following = {}

# Continue to parse while there is a next page, or we're not on the
# last page
while has_next_page or not on_last_page:
    results = get_followers_and_next_page(my_follower_data)

    my_followers = results[0]
    has_next_page = results[1]
    cursor = results[2]

    # Add all of my followers to the graph, and capture the connection
    for follower in my_followers:
        follower_login = follower['login']
        # Add them to our graph (add_edge creates missing nodes implicitly)
        graph.add_edge(user_login_name, follower_login)

        # Now grab who they are following
        results = get_first_layer_followers_and_next_page(follower)

        # Again, who our followers are following
        following = results[0]
        following_has_next_page = results[1]
        following_cursor = results[2]

        # Add all the people our followers are following
        for followed_login in following:
            graph.add_edge(follower_login, followed_login)

        # Some of the people following us will themselves follow more than
        # 100 people, so save their cursor for extra paging below
        if following_has_next_page:
            followers_with_many_following[follower_login] = following_cursor

    # Loop maintenance
    if not has_next_page:
        on_last_page = True
    else:
        # Fetch the next page of my followers using the cursor
        my_follower_data, _ = get_follower_and_login_data(cursor)

# Keep paging through the followers who follow more than 100 people,
# until every entry is exhausted (or is obviously enormous)
while followers_with_many_following:
    keys = tuple(followers_with_many_following.keys())
    for key in keys:
        cursor = followers_with_many_following[key]
        # Ask Github for the next page of this user's "following" list
        data = get_data(user_query.format(login=key, cursor=cursor))
        follower_data = data['repositoryOwner']['following']

        # Add this page of follow edges to the graph
        for edge in follower_data['edges']:
            graph.add_edge(key, edge['node']['login'])

        # Stop paging a user when there are no more pages, or when they
        # follow an unreasonable number of people (> 1000)
        if (not follower_data['pageInfo']['hasNextPage']
                or follower_data['totalCount'] > 1000):
            followers_with_many_following.pop(key)
        else:
            cursor = follower_data['pageInfo']['endCursor']
            followers_with_many_following[key] = cursor


In [ ]:
#ignore
# https://github.com/tpoisot/nxfa2
def forceatlas2_layout(G, iterations=10, linlog=False, pos=None, nohubs=False,
                       kr=0.001, k=None, dim=2):
    """Position the nodes of G with the Force Atlas 2 algorithm.

    G            the networkx graph to lay out
    iterations   number of relaxation iterations to run
    linlog       whether to use linear or log repulsion
    pos          optional (nnodes, dim) array of starting positions
    nohubs       dampen repulsion coming from high-degree nodes (hubs)
    kr           kept for interface compatibility (unused here)
    k            optimal node spacing; defaults to sqrt(1 / nnodes)
    dim          dimensionality of the layout (2 for a flat plot)

    Returns a dict mapping each node of G to its final position vector.
    """
    # We add attributes to store the current and previous convergence speed.
    # (G.node was removed in networkx 2.4; G.nodes is the modern spelling.)
    for n in G:
        G.nodes[n]['prevcs'] = 0
        G.nodes[n]['currcs'] = 0
    # To a scipy sparse adjacency structure.
    # This comes from the sparse FR layout in nx; to_scipy_sparse_matrix
    # was removed in networkx 3.0 in favor of to_scipy_sparse_array.
    A = nx.to_scipy_sparse_array(G, dtype='f')
    nnodes, _ = A.shape

    try:
        A = A.tolil()
    except AttributeError:
        # Fall back through COO for formats without a direct .tolil();
        # the original called a bare coo_matrix, which was never imported.
        A = sp.sparse.coo_matrix(A).tolil()
    if pos is None:
        pos = np.asarray(np.random.random((nnodes, dim)), dtype=A.dtype)
    else:
        pos = pos.astype(A.dtype)
    if k is None:
        k = np.sqrt(1.0 / nnodes)
    # Iterations
    # the initial "temperature" is about .1 of domain area (=1x1)
    # this is the largest step allowed in the dynamics.
    t = 0.1
    # simple cooling scheme.
    # linearly step down by dt on each iteration so last iteration is size dt.
    dt = t / float(iterations + 1)
    displacement = np.zeros((dim, nnodes))
    for iteration in range(iterations):
        displacement *= 0
        # loop over rows
        for i in range(A.shape[0]):
            # difference between this row's node position and all others
            delta = (pos[i] - pos).T
            # distance between points
            distance = np.sqrt((delta ** 2).sum(axis=0))
            # enforce minimum distance of 0.01
            distance = np.where(distance < 0.01, 0.01, distance)
            Ai = np.asarray(A.getrowview(i).toarray())
            # displacement "force"
            Dist = k * k / distance ** 2
            if nohubs:
                # Ai is a single row, so .sum() equals .sum(axis=1) here and
                # avoids the deprecated float(size-1 array) conversion
                Dist = Dist / float(Ai.sum() + 1)
            if linlog:
                Dist = np.log(Dist + 1)
            displacement[:, i] += \
                (delta * (Dist - Ai * distance / k)).sum(axis=1)
        # update positions
        length = np.sqrt((displacement ** 2).sum(axis=0))
        length = np.where(length < 0.01, 0.01, length)
        pos += (displacement * t / length).T
        # cool temperature
        t -= dt
    # Return the layout
    return dict(zip(G, pos))


We’ll use the Force Atlas 2 algorithm to handle the spatialization of our network graph. The implementation is here.

In [ ]:
# Run 100 iterations of Force Atlas 2 to compute a 2-D position for each node
positions = forceatlas2_layout(graph, linlog=False, nohubs=False, iterations=100)

In [ ]:
# Draw the network with tiny, unlabeled nodes at the computed positions
nx.draw(graph, positions, node_size=1, with_labels=False)
# NOTE(review): savefig won't create the 'images/' directory — it must exist
plt.savefig('images/github-analysis.png')
plt.show()


Looks like my Github followers don’t form much of a traditional social network.

In [ ]:
# Print a node/edge-count summary of the graph.
# nx.info() was removed in networkx 3.0; printing the graph itself
# produces the equivalent summary string.
print(graph)

By @Ben Hoff in
Tags :