Visualizing a YouTube Ego Network Using NetworkX

Published: Wed 12 July 2017

In Random.

Let’s install some stuff

In [ ]:
!pip install google-api-python-client networkx scipy numpy matplotlib

Now let’s import a bunch of stuff (most of it is to deal with OAuth2) and tell matplotlib to display plots inline, right below the cells.

In [ ]:
import os
import tempfile

import numpy as np
import scipy

from argparse import Namespace

import httplib2
from apiclient.discovery import build
from apiclient.errors import HttpError

from oauth2client.file import Storage
from oauth2client.tools import run_flow
from oauth2client.client import OAuth2WebServerFlow

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

Google’s API is stringly typed. And it’s totally cool.

I’m totally not irritated that these aren’t enumerated in your Python library.

This is my not irritated voice.

In [ ]:
YOUTUBE_READONLY_SCOPE = "https://www.googleapis.com/auth/youtube.readonly"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

Google wants to store the response/counter-response in a file so you don’t have to reauthenticate, which I get. But it wants to drop random files on your system, which I don’t like. So we’re going to hack around this by dropping the file in the temporary directory, where it’ll be cleaned up on reboot. Technically, since mkstemp doesn’t clean up after itself, I should delete the file manually.

In [ ]:
handle, filename = tempfile.mkstemp()
os.close(handle)
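
If waiting for a reboot feels too lazy, here’s a minimal sketch that registers the manual cleanup to run when the interpreter exits:

In [ ]:
# optional: delete the credentials file when Python exits,
# rather than waiting for the OS to clear the temp directory
import atexit
atexit.register(os.remove, filename)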

You’ll need OAuth2 credentials (a client ID and client secret), which you can create in the Google API Console. The service object, built with the build method (imported from apiclient.discovery), is the only object we’ll use for the rest of this post. Everything else is just setup.

In [ ]:
client_id = '408396439838-o291hou2dra7l4i03vcg6ha6k7u8qk9b.apps.googleusercontent.com'
# Just use a normal string. I'm doing this so I don't leak credentials.
client_secret = os.getenv('YOUTUBE_CLIENT_SECRET')

kwargs = {"auth_uri":"https://accounts.google.com/o/oauth2/auth",
          "token_uri":"https://accounts.google.com/o/oauth2/token",
          "auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs",
          "redirect_uris":["urn:ietf:wg:oauth:2.0:oob", "http://localhost"]}

flow = OAuth2WebServerFlow(client_id, client_secret, YOUTUBE_READONLY_SCOPE, **kwargs)

storage = Storage(filename)
credentials = storage.get()

args = Namespace(auth_host_name='localhost',
                 auth_host_port=[8080,8090],
                 noauth_local_webserver=False,
                 logging_level='ERROR')

if credentials is None or credentials.invalid:
    credentials = run_flow(flow, storage, args)

service = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                http=credentials.authorize(httplib2.Http()))
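
Before building the graph, it’s worth a quick sanity check that the credentials actually work; a minimal sketch that fetches your own channel title (channels().list with mine=True is a standard YouTube Data API v3 call):

In [ ]:
# sanity check: fetch our own channel to confirm the OAuth flow worked
response = service.channels().list(part='snippet', mine=True).execute()
print(response['items'][0]['snippet']['title'])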

Graph setup!

In [ ]:
graph = nx.Graph()

Get every single subscriber, add the subscriber’s name as a node, and capture the connection as an edge. Store each subscriber’s channel id and name in stored_channel_ids so we can later fetch everyone that they are following.

In [ ]:
kwargs = {'part': 'subscriberSnippet', 'mySubscribers': 'true', 'maxResults': '50'}

# Ask for the snippet too, so we can use publishedAt for contagion analysis:
# kwargs = {'part': 'subscriberSnippet,snippet', 'mySubscribers': 'true', 'maxResults': '50'}

has_next_page = True
stored_channel_ids = {}

# NOTE: change this to be your channel name.
channel_name = 'benhoff'
graph.add_node(channel_name)

while has_next_page:
    results = service.subscriptions().list(**kwargs).execute()
    has_next_page = results.get('nextPageToken')
    if has_next_page is not None:
        kwargs['pageToken'] = has_next_page
        
    for result in results['items']:
        name = result['subscriberSnippet']['title']
        channel_id = result['subscriberSnippet']['channelId']
        stored_channel_ids[channel_id] = [name,]
        # if ever interested in doing contagion-type analysis:
        # result['snippet']['publishedAt']
        graph.add_node(name)
        graph.add_edge(name, channel_name)
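
That token-chasing dance shows up again in the next cell, so here’s a minimal generator sketch that hides the pagination (iterate_pages is a hypothetical helper, not part of the client library):

In [ ]:
def iterate_pages(list_method, **kwargs):
    """Hypothetical helper: yield each page of results from a
    list() call, following nextPageToken until exhausted."""
    while True:
        results = list_method(**kwargs).execute()
        yield results
        token = results.get('nextPageToken')
        if token is None:
            return
        kwargs['pageToken'] = token

# usage, mirroring the loop above:
# for page in iterate_pages(service.subscriptions().list,
#                           part='subscriberSnippet',
#                           mySubscribers='true', maxResults='50'):
#     for item in page['items']:
#         ...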
        

Now, iterate through stored_channel_ids to get every channel that our subscribers are following, and add those to the graph.

In [ ]:
kwargs = {'part': 'snippet', 'channelId': '', 'maxResults': '50'}
while stored_channel_ids:
    channel_ids = tuple(stored_channel_ids.keys())
    
    for channel_id in channel_ids:
        # update the query to include this channel id
        kwargs['channelId'] = channel_id
        # update the page token if we have one, otherwise drop any stale one
        try:
            kwargs['pageToken'] = stored_channel_ids[channel_id][1]
        except IndexError:
            kwargs.pop('pageToken', None)
            
        # the client raises an HttpError for channels whose subscription
        # list is private, so catch it and skip that channel
        try:
            results = service.subscriptions().list(**kwargs).execute()
        except HttpError:
            stored_channel_ids.pop(channel_id)
            continue
            
        name = stored_channel_ids[channel_id][0]
        for result in results['items']:
            follower_name = result['snippet']['title']
            graph.add_node(follower_name)
            graph.add_edge(name, follower_name)
            
        has_token = results.get('nextPageToken')

        if not has_token:
            # remove this from the dict
            stored_channel_ids.pop(channel_id)
        else:
            try:
                stored_channel_ids[channel_id][1] = has_token
            except IndexError:
                stored_channel_ids[channel_id].append(has_token)
In [ ]:
#ignore
# adapted from https://github.com/tpoisot/nxfa2
from scipy.sparse import coo_matrix

def forceatlas2_layout(G, iterations=10, linlog=False, pos=None, nohubs=False,
                       kr=0.001, k=None, dim=2):
    """
    Options values are
    g                The graph to layout
    iterations       Number of iterations to do
    linlog           Whether to use linear or log repulsion
    random_init      Start with a random position
                     If false, start with FR
    avoidoverlap     Whether to avoid overlap of points
    degreebased      Degree based repulsion
    """
    # We add attributes to store the current and previous convergence speed
    for n in G:
        G.node[n]['prevcs'] = 0
        G.node[n]['currcs'] = 0
    # Convert to a sparse matrix; this comes from the sparse FR layout in nx
    A = nx.to_scipy_sparse_matrix(G, dtype='f')
    nnodes, _ = A.shape

    try:
        A = A.tolil()
    except Exception:
        A = coo_matrix(A).tolil()
    if pos is None:
        pos = np.asarray(np.random.random((nnodes, dim)), dtype=A.dtype)
    else:
        pos = pos.astype(A.dtype)
    if k is None:
        k = np.sqrt(1.0 / nnodes)
    # the initial "temperature" is about .1 of domain area (=1x1)
    # this is the largest step allowed in the dynamics.
    t = 0.1
    # simple cooling scheme:
    # linearly step down by dt on each iteration so last iteration is size dt.
    dt = t / float(iterations + 1)
    displacement = np.zeros((dim, nnodes))
    for iteration in range(iterations):
        displacement *= 0
        # loop over rows
        for i in range(A.shape[0]):
            # difference between this row's node position and all others
            delta = (pos[i] - pos).T
            # distance between points
            distance = np.sqrt((delta ** 2).sum(axis=0))
            # enforce minimum distance of 0.01
            distance = np.where(distance < 0.01, 0.01, distance)
            # the adjacency matrix row
            Ai = np.asarray(A.getrowview(i).toarray())
            # displacement "force"
            Dist = k * k / distance ** 2
            if nohubs:
                Dist = Dist / float(Ai.sum(axis=1) + 1)
            if linlog:
                Dist = np.log(Dist + 1)
            displacement[:, i] += \
                (delta * (Dist - Ai * distance / k)).sum(axis=1)
        # update positions
        length = np.sqrt((displacement ** 2).sum(axis=0))
        length = np.where(length < 0.01, 0.01, length)
        pos += (displacement * t / length).T
        # cool temperature
        t -= dt
    # return the layout as a dict of node -> position
    return dict(zip(G, pos))

We need to position all our nodes and edges so that we can make some sense of the graph. We’ll use either the forceatlas2_layout function defined above (ported from tpoisot/nxfa2) or NetworkX’s built-in spring_layout.

In [ ]:
positions = forceatlas2_layout(graph, linlog=False, nohubs=False, iterations=100)
In [ ]:
#ignore
positions = nx.spring_layout(graph)

Draw the graph

In [ ]:
nx.draw(graph, positions, node_size=1, with_labels=False)
plt.savefig('images/youtube-analysis.png')
plt.show()
In [ ]:
print(nx.info(graph))
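
nx.info reports node and edge counts; if you also want to see which channels dominate the network, here’s a minimal sketch that ranks nodes by degree:

In [ ]:
# rank channels by how many connections they have in the ego network
top_channels = sorted(dict(graph.degree()).items(),
                      key=lambda pair: pair[1], reverse=True)[:10]
for channel, degree in top_channels:
    print(degree, channel)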
