In [3]:
import urllib2
#import re
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

link0 = "https://github.com/scikit-learn/scikit-learn"
link1 = "https://github.com/numenta/nupic"
link2 = "https://github.com/machinalis/iepy"
link3 = "https://github.com/machinalis/quepy"
link4 = "https://github.com/machinalis/featureforge"
link5 = "https://github.com/jaberg/skdata"
link6 = "https://github.com/rasbt/mlxtend"
link7 = "https://github.com/awslabs/machine-learning-samples"
link8 = "https://github.com/yandex/rep"
link9 = "https://github.com/dclambert/Python-ELM"
link10 = "https://github.com/mila-udem/fuel"
link11 = "https://github.com/nilearn/nilearn"
link12 = "https://github.com/idiap/bob"
link13 = "https://github.com/pybrain/pybrain"
link14 = "https://github.com/clips/pattern"
link15 = "https://github.com/lisa-lab/pylearn2"
link16 = "https://github.com/luispedro/milk"
link17 = "https://github.com/numenta/nupic"
link18 = "https://github.com/kvh/ramp"
link19 = "https://github.com/dnouri/nolearn"

#htmllist  = [ "link"+str(i) for i in range(2) ]
#print htmllist


htmllist2 = [link0, link1, link2, link3, link4, link5, link6, link7, 
             link8, link9, link10, link11, link12, link13, link14, link14, link15, link16, link17, link18, link19]
#print htmllist , htmllist2

s=set(stopwords.words('english')) # list of words, which have no special meaning
n = 0 
doclist = [] #list, which will be filled with title, abstract and claims from found patents

#print htmllist2
x1 = []
x2= []
for i in htmllist2:
    search_address = i
    response = urllib2.urlopen(search_address)
    html = response.read()#3
    #print html
    #finds title text between the characters FONT size... and FONT
    com = html.split('      <li class="commits">')[-1].split('            commits')[0]
    com = com.split('<span class="num text-emphasized">')[-1].split('</span>')[0]
    com = com.strip()
    com = int(com.replace(',', ''))
    con = html.split('         releases')[-1].split('    contributors')[0]
    con = con.split('<span class="num text-emphasized">')[-1].split('</span>')[0]
    con = con.strip()
    try:
        con =  int(con.replace(',', ''))
    except ValueError:
        con = 0
    #con = int(con.replace(',', ''))
    #print link1, "Contributors", con #, len(con.strip())
    x1.append(com) 
    x2.append(con)
print x1 ,x2
[18861, 4392, 1760, 131, 219, 441, 135, 15, 50, 17, 497, 2742, 5080, 969, 943, 943, 7030, 687, 4392, 179, 204] [404, 60, 10, 9, 3, 10, 5, 3, 0, 1, 12, 0, 11, 27, 0, 0, 117, 9, 60, 4, 4]

In [7]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from __future__ import division

plt.loglog(x2,x1, 'ro', markersize=15)
Out[7]:
[<matplotlib.lines.Line2D at 0x14d0d6a0>]