Visualizing Github Network Using Networkx

Published: Sun 09 July 2017

In Random.

Let’s install a bunch of stuff. You’ll need a virtual environment activated, or be running as root, otherwise your system will yell at you.

In [ ]:
!pip install requests networkx matplotlib scipy numpy

Cool, now let’s import a bunch of stuff, and tell matplotlib that we want to see the graphs, instead of saving them to a file or something.

In [ ]:
import os
import json


import requests
import networkx as nx
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Let’s try to find the community that follows me on GitHub. We’ll query the GitHub GraphQL API using an OAuth token, which you can get by following these directions and signing the prerelease agreement. I used GitHub’s GraphQL API Explorer to help format the queries. We’ll fetch our followers and everyone that they are following. You can see the queries here.

In [ ]:
# Endpoint that the GraphQL requests below are POSTed to
github_graphql_url = 'https://api.github.com/graphql'

# The token is read from an environment variable so this notebook can be
# shared without leaking it; a plain string literal would work just as well
oauth_token = os.environ.get('GITHUB_OAUTH_TOKEN')

# f-strings (Python 3.6+) make building the header value a one-liner
headers = {'Authorization': f'bearer {oauth_token}'}
In [ ]:
#ignore
# GraphQL query template for one page of the viewer's followers, together
# with the first 100 accounts each of those followers is following.
# The doubled braces are literal braces escaped for str.format(); only
# {follower_args} is substituted.
query = '''
query {{
viewer {{
  login
  followers ({follower_args})
  {{
    pageInfo {{
      hasNextPage
      endCursor
    }}
    nodes {{
      login
      following (first: 100) {{
        pageInfo {{
          hasNextPage
          endCursor
        }}
        edges {{
          node {{
            login
          }}
        }}
      }}
    }}
  }}
}}
}}
'''

# GraphQL query template for a further page of one user's "following"
# list; {login} and {cursor} are filled in with str.format().
user_query ='''
query {{
repositoryOwner(login: "{login}"){{
  ... on User {{
    following(first: 100 after: "{cursor}") {{
      totalCount
      pageInfo {{
        endCursor
        hasNextPage
      }}
      edges {{
        node {{
          login
        }}
      }}
    }}
  }}
}}
}}
'''

# Arguments for the followers connection: used as-is for the first page ...
initial_follower_args = 'first: 100'
# ... and, with {cursor} filled in, for every page after a given cursor
cursor_follower_args = "first: 100 after: \"{cursor}\""

I don’t want to remember how to get the data out while I’m handling the parsing of the data (which will be the majority of the code), so let’s define helper methods to get the data.

In [ ]:
def get_data(query):
    """POST a GraphQL query to the GitHub API and return its ``data`` payload.

    Args:
        query: The GraphQL query document, as a string.

    Returns:
        The value stored under the top-level ``data`` key of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        RuntimeError: If the reply reports GraphQL ``errors`` or carries no
            ``data`` payload.
    """
    # Serialize the request body: the dict holding the query becomes a
    # JSON string
    payload = json.dumps({'query': query})
    # We'll use requests, passing in the URL, query data, and header info
    response = requests.post(github_graphql_url,
                             data=payload,
                             headers=headers)
    # Surface HTTP-level failures here rather than as a confusing KeyError
    # further down
    response.raise_for_status()

    body = response.json()
    errors = body.get('errors')
    if errors or body.get('data') is None:
        raise RuntimeError(f'GitHub GraphQL query failed: {errors or body}')

    # Return the json data, stripping out the first key
    return body['data']

# Define a helper method that deals with the cursor logic
def get_follower_and_login_data(cursor=None):
    """Fetch one page of the viewer's followers plus the viewer's login.

    With a truthy ``cursor`` the page after that cursor is requested;
    otherwise the first 100 followers are requested.

    Returns a ``(follower_data, user_login_name)`` pair taken from the
    ``viewer`` object of the reply.
    """
    # Choose the pagination arguments, then splice them into the query
    follower_args = (cursor_follower_args.format(cursor=cursor)
                     if cursor else initial_follower_args)
    viewer = get_data(query.format(follower_args=follower_args))['viewer']

    # Hand back the follower page together with our own login name
    return viewer['followers'], viewer['login']

We’ll also define a few methods to help us deal with the deeply nested JSON reply that we’re going to get back.

In [ ]:
def get_followers_and_next_page(my_follower_data: dict) -> (list, bool, str):
    """Unpack one page of follower data.

    Args:
        my_follower_data: A dict with a ``nodes`` entry and a ``pageInfo``
            entry holding ``hasNextPage`` and ``endCursor``.

    Returns:
        A ``(followers, has_next_page, cursor)`` triple: the ``nodes``
        value, the ``hasNextPage`` flag, and the ``endCursor`` value to
        use in the next query.
    """
    # who is following me?
    my_followers = my_follower_data['nodes']

    page_info = my_follower_data['pageInfo']
    # do I have more than one page of followers?
    has_next_page = page_info['hasNextPage']
    # what is the cursor string for use in the next query?
    cursor = page_info['endCursor']

    return my_followers, has_next_page, cursor

def get_first_layer_followers_and_next_page(a_follower: dict) -> (list,
                                                                  bool,
                                                                  str):
    """Unpack the first page of accounts that one follower is following.

    Args:
        a_follower: A dict whose ``following`` entry holds ``edges`` (each
            with ``node.login``) and ``pageInfo`` (``hasNextPage`` and
            ``endCursor``).

    Returns:
        A ``(logins, has_next_page, cursor)`` triple: the logins found in
        ``edges``, the ``hasNextPage`` flag, and the ``endCursor`` value of
        this follower's own ``following`` page.
    """
    # Who is this user following?
    following = a_follower['following']
    page_info = following['pageInfo']

    # Is there a next page?
    has_next_page = page_info['hasNextPage']
    # The cursor must come from this follower's own page info; reading it
    # from the viewer's follower page would resume at the wrong position
    cursor = page_info['endCursor']
    logins = [edge['node']['login'] for edge in following['edges']]

    return logins, has_next_page, cursor

We’ll be using networkx to help us do the network graph.

In [ ]:
# Set up the (initially empty) graph; the cells below add a node per login
# and an edge per "follows" relationship
graph = nx.Graph()

Now let’s get/parse the data and add it to the graph.

In [ ]:
# Fetch the first page of followers and register ourselves in the graph
my_follower_data, user_login_name = get_follower_and_login_data()
graph.add_node(user_login_name)

# Maps a follower's login to the cursor of their first "following" page,
# for followers whose "following" list did not fit into that one page
followers_with_many_following = {}

# Walk the follower pages one at a time until there is no next page
while True:
    my_followers, has_next_page, cursor = get_followers_and_next_page(
        my_follower_data)

    # Add all of my followers to the graph, and capture the connection
    for follower in my_followers:
        follower_login = follower['login']
        graph.add_node(follower_login)
        graph.add_edge(follower_login, user_login_name)

        # Now grab who they are following
        following, following_has_next_page, following_cursor = (
            get_first_layer_followers_and_next_page(follower))

        # Add all the people our followers are following
        for login in following:
            graph.add_node(login)
            graph.add_edge(follower_login, login)

        # Some of our followers follow more than 100 accounts; remember
        # where to resume so the remaining pages can be fetched later
        if following_has_next_page:
            followers_with_many_following[follower_login] = following_cursor

    # Loop maintenance: stop after the last page, otherwise fetch the next
    if not has_next_page:
        break
    my_follower_data, _ = get_follower_and_login_data(cursor)
   
# Keep requesting further "following" pages, one page per follower per
# pass, until no follower has pages left to fetch
while followers_with_many_following:
    # Iterate over a snapshot so entries can be dropped during the pass
    for key in tuple(followers_with_many_following):
        cursor = followers_with_many_following[key]
        page_query = user_query.format(login=key, cursor=cursor)
        follower_data = get_data(page_query)['repositoryOwner']['following']
        page_info = follower_data['pageInfo']

        # Carry on only while more pages exist and the account follows at
        # most 1000 others; otherwise this follower is finished
        if (page_info['hasNextPage']
                and follower_data['totalCount'] <= 1000):
            cursor = page_info['endCursor']
            followers_with_many_following[key] = cursor
        else:
            followers_with_many_following.pop(key)

        # The page that was just fetched is added to the graph either way
        logins = [edge['node']['login'] for edge in follower_data['edges']]
        for login in logins:
            graph.add_node(login)
            graph.add_edge(key, login)
In [ ]:
#ignore
# https://github.com/tpoisot/nxfa2
def forceatlas2_layout(G, iterations=10, linlog=False, pos=None, nohubs=False,
                       kr=0.001, k=None, dim=2):
    """Compute an iterative force-directed layout for ``G``.

    Args:
        G: The graph to lay out. As a side effect, every node gets
            ``prevcs`` and ``currcs`` attributes set to 0.
        iterations: Number of iterations to run.
        linlog: If true, the repulsion term is replaced by
            ``log(term + 1)``.
        pos: Optional initial positions with one row per node and ``dim``
            columns (assumed to be a NumPy array in the graph's node
            order). Random positions are used when omitted.
        nohubs: If true, a node's repulsion term is divided by the sum of
            its adjacency row plus one.
        kr: Currently unused; kept so existing callers keep working.
        k: Distance scale of the forces; defaults to
            ``sqrt(1 / number_of_nodes)``.
        dim: Number of layout dimensions.

    Returns:
        A dict mapping each node of ``G`` to its position (one row of the
        position array). An empty graph yields an empty dict.
    """
    # Local import: the fallback below needs coo_matrix, which the file's
    # top-level imports do not provide
    from scipy.sparse import coo_matrix

    # Nothing to lay out (and 1 / nnodes below would divide by zero)
    if len(G) == 0:
        return {}

    # ``G.node`` only exists in older NetworkX releases; use ``G.nodes``
    # where it is gone
    node_attrs = getattr(G, 'node', None)
    if node_attrs is None:
        node_attrs = G.nodes
    # We add attributes to store the current and previous convergence speed
    for n in G:
        node_attrs[n]['prevcs'] = 0
        node_attrs[n]['currcs'] = 0

    # Sparse adjacency matrix; this comes from the sparse FR layout in nx.
    # Prefer the original conversion function and fall back to its
    # replacement on NetworkX releases that no longer ship it.
    to_sparse = getattr(nx, 'to_scipy_sparse_matrix', None)
    if to_sparse is None:
        to_sparse = nx.to_scipy_sparse_array
    A = to_sparse(G, dtype='f')
    nnodes, _ = A.shape

    try:
        A = A.tolil()
    except Exception:
        # Fallback kept from the original: rebuild via COO if tolil() fails
        A = (coo_matrix(A)).tolil()
    if pos is None:
        pos = np.asarray(np.random.random((nnodes, dim)), dtype=A.dtype)
    else:
        pos = pos.astype(A.dtype)
    if k is None:
        k = np.sqrt(1.0 / nnodes)

    # the initial "temperature" is about .1 of domain area (=1x1)
    # this is the largest step allowed in the dynamics.
    t = 0.1
    # simple cooling scheme.
    # linearly step down by dt on each iteration so last iteration is size dt.
    dt = t / float(iterations + 1)
    displacement = np.zeros((dim, nnodes))
    for _ in range(iterations):
        displacement *= 0
        # loop over rows
        for i in range(A.shape[0]):
            # difference between this row's node position and all others
            delta = (pos[i] - pos).T
            # distance between points
            distance = np.sqrt((delta ** 2).sum(axis=0))
            # enforce minimum distance of 0.01
            distance = np.where(distance < 0.01, 0.01, distance)
            # the adjacency matrix row
            Ai = np.asarray(A.getrowview(i).toarray())
            # displacement "force"
            Dist = k * k / distance ** 2
            if nohubs:
                # Ai is a single row, so its total equals the row sum;
                # summing to a scalar avoids float() on a 1-element array
                Dist = Dist / float(Ai.sum() + 1)
            if linlog:
                Dist = np.log(Dist + 1)
            displacement[:, i] += \
                (delta * (Dist - Ai * distance / k)).sum(axis=1)
        # update positions
        length = np.sqrt((displacement ** 2).sum(axis=0))
        length = np.where(length < 0.01, 0.01, length)
        pos += (displacement * t / length).T
        # cool temperature
        t -= dt
    # Return the layout
    return dict(zip(G, pos))

We’ll use the Force Atlas 2 algorithm to deal with the spatialization of our network graph. The implementation is here.

In [ ]:
# Lay the graph out with 100 iterations of forceatlas2_layout, leaving the
# linlog and nohubs options switched off
positions = forceatlas2_layout(graph, linlog=False, nohubs=False, iterations=100)
In [ ]:
# Draw every node as a tiny unlabeled dot at its computed position
nx.draw(graph, positions, node_size=1, with_labels=False)
# Make sure the output directory exists before writing the image into it
os.makedirs('images', exist_ok=True)
plt.savefig('images/github-analysis.png')
plt.show()

Looks like my Github followers don’t form much of a traditional social network.

In [ ]:
# Print a short summary of the graph. nx.info() is not available in newer
# NetworkX releases; printing the graph itself is the fallback there.
if hasattr(nx, 'info'):
    print(nx.info(graph))
else:
    print(graph)

Comments !

Subscribe to the mailing list

* indicates required

links

social