In [168]:
import urllib2

# Fetch the blog post that lists machine-learning modules and collect all GitHub links.
search_address = "http://lernpython.de/21-machine-learning-pythonmodule-auf-github"
response = urllib2.urlopen(search_address)
html = response.read()

# The links are attribute values, so splitting on double quotes isolates them.
html2 = html.split('"')

htmllist2 = []
for i in html2:
    if i.startswith("https://github.com"):
        htmllist2.append(i)

for i in htmllist2:
    print i
https://github.com/scikit-learn/scikit-learn
https://github.com/numenta/nupic
https://github.com/machinalis/iepy
https://github.com/machinalis/quepy
https://github.com/machinalis/featureforge
https://github.com/jaberg/skdata
https://github.com/rasbt/mlxtend
https://github.com/awslabs/machine-learning-samples
https://github.com/yandex/rep
https://github.com/dclambert/Python-ELM
https://github.com/mila-udem/fuel
https://github.com/nilearn/nilearn
https://github.com/idiap/bob
https://github.com/pybrain/pybrain
https://github.com/clips/pattern
https://github.com/lisa-lab/pylearn2
https://github.com/luispedro/milk
https://github.com/numenta/nupic
https://github.com/kvh/ramp
https://github.com/dnouri/nolearn
https://github.com/Theano/Theano
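Splitting on double quotes works here, but it silently depends on every link being a quoted attribute value. As a sketch of a slightly more robust alternative (assuming the page still embeds plain https://github.com/<owner>/<repo> links), a regular expression can pull the repository URLs directly and drop duplicates while keeping their order:

import re
import urllib2

# Sketch: match owner/repo URLs directly instead of relying on quoted attribute values.
html = urllib2.urlopen("http://lernpython.de/21-machine-learning-pythonmodule-auf-github").read()
links = re.findall(r'https://github\.com/[\w.\-]+/[\w.\-]+', html)

# Keep the first occurrence of each repository, preserving the original order.
seen = set()
repos = []
for link in links:
    if link not in seen:
        seen.add(link)
        repos.append(link)

for link in repos:
    print link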

In [170]:
x1 = []  # commits per repository
x2 = []  # contributors per repository
x3 = []  # repository names
for i in htmllist2:
    search_address = i
    response = urllib2.urlopen(search_address)
    name = i.split('/')[-1]

    html = response.read()
    # Number of commits: cut out the text between the GitHub page markers,
    # then read the number inside the emphasized <span>.
    com = html.split('      <li class="commits">')[-1].split('            commits')[0]
    com = com.split('<span class="num text-emphasized">')[-1].split('</span>')[0]
    com = com.strip()
    try:
        com = int(com.replace(',', ''))
    except ValueError:
        pass

    # Number of contributors: same approach with the neighbouring markers.
    con = html.split('         releases')[-1].split('    contributors')[0]
    con = con.split('<span class="num text-emphasized">')[-1].split('</span>')[0]
    con = con.strip()
    try:
        con = int(con.replace(',', ''))
        x2.append(con)
        x3.append(name)
        x1.append(com)
    except ValueError:
        pass

print x1, x2, x3
[18911, 4431, 1760, 131, 219, 441, 147, 15, 50, 17, 5080, 969, 943, 7042, 4431, 179, 205, 18576] [409, 61, 10, 9, 3, 10, 5, 3, 3, 1, 11, 27, 20, 117, 61, 4, 4, 141] ['scikit-learn', 'nupic', 'iepy', 'quepy', 'featureforge', 'skdata', 'mlxtend', 'machine-learning-samples', 'rep', 'Python-ELM', 'bob', 'pybrain', 'pattern', 'pylearn2', 'nupic', 'ramp', 'nolearn', 'Theano']
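The commit and contributor counts are cut out of GitHub's HTML with hard-coded markers, and the same split/strip/int sequence appears twice. A small helper makes the brittleness explicit and easier to adjust if GitHub changes its markup (a sketch built around the same markers used above):

def extract_count(html, start_marker, end_marker):
    # Cut the fragment between the two markers, then pull the number out of the
    # '<span class="num text-emphasized">' element, exactly as in the loop above.
    fragment = html.split(start_marker)[-1].split(end_marker)[0]
    value = fragment.split('<span class="num text-emphasized">')[-1].split('</span>')[0]
    try:
        return int(value.strip().replace(',', ''))
    except ValueError:
        return None  # marker not found or GitHub changed its page layout

# Usage, equivalent to the extraction in the loop above:
# com = extract_count(html, '      <li class="commits">', '            commits')
# con = extract_count(html, '         releases', '    contributors')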

In [295]:
import pandas as pd

print len(x3), len(x2), len(x1)

# Commits per contributor for each module (used later as marker size).
size2 = []
for i in range(len(x2)):
    size2.append(float(x1[i]) / x2[i])

d = {'Commits' : x1,
     'Contributors' : x2,
     'Module name' : x3,
     'Ratio Commits/Contributors' : size2}

df = pd.DataFrame(d)
%matplotlib inline
df.plot(logy=True, logx=True, kind='scatter', y='Commits', x='Contributors');
df
18 18 18

Out[295]:
Commits Contributors Module name Ratio Commits/Contributors
0 18911 409 scikit-learn 46.237164
1 4431 61 nupic 72.639344
2 1760 10 iepy 176.000000
3 131 9 quepy 14.555556
4 219 3 featureforge 73.000000
5 441 10 skdata 44.100000
6 147 5 mlxtend 29.400000
7 15 3 machine-learning-samples 5.000000
8 50 3 rep 16.666667
9 17 1 Python-ELM 17.000000
10 5080 11 bob 461.818182
11 969 27 pybrain 35.888889
12 943 20 pattern 47.150000
13 7042 117 pylearn2 60.188034
14 4431 61 nupic 72.639344
15 179 4 ramp 44.750000
16 205 4 nolearn 51.250000
17 18576 141 Theano 131.744681
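nupic was linked twice in the blog post, so it also appears twice in the table (rows 1 and 14). If the duplicate should be dropped before plotting, one short way to do it on the DataFrame built above (a sketch, not applied to the results shown here):

# Keep only the first row for each module name.
df_unique = df[~df['Module name'].duplicated()]
df_unique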
In [294]:
from __future__ import division
import matplotlib.pyplot as plt
print x1, x2, size2, x3

# Log-log scatter plot: contributors on x, commits on y,
# marker size proportional to commits per contributor.
fig, ax = plt.subplots()
ax.set_yscale('log')
ax.set_xscale('log')
ax.scatter(x2, x1, s=size2)
ax.set_xlabel("Contributors", fontsize=12)
ax.set_ylabel("Commits", fontsize=12)

# Label each point with the module name.
for i, txt in enumerate(x3):
    ax.annotate(txt, (x2[i] + 0.1, x1[i] + 0.1))

plt.grid()
plt.show()
[18911, 4431, 1760, 131, 219, 441, 147, 15, 50, 17, 5080, 969, 943, 7042, 4431, 179, 205, 18576] [409, 61, 10, 9, 3, 10, 5, 3, 3, 1, 11, 27, 20, 117, 61, 4, 4, 141] [46.23716381418093, 72.63934426229508, 176.0, 14.555555555555555, 73.0, 44.1, 29.4, 5.0, 16.666666666666668, 17.0, 461.8181818181818, 35.888888888888886, 47.15, 60.18803418803419, 72.63934426229508, 44.75, 51.25, 131.74468085106383] ['scikit-learn', 'nupic', 'iepy', 'quepy', 'featureforge', 'skdata', 'mlxtend', 'machine-learning-samples', 'rep', 'Python-ELM', 'bob', 'pybrain', 'pattern', 'pylearn2', 'nupic', 'ramp', 'nolearn', 'Theano']
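On logarithmic axes an additive offset of 0.1 is practically invisible next to values in the hundreds or thousands, so the labels sit right on top of the points. A multiplicative offset keeps the shift proportional to each value (a sketch replacing the annotation loop above; the factor 1.05 is an arbitrary choice):

# Shift each label by ~5% of its coordinates, which reads as a constant offset on log axes.
for i, txt in enumerate(x3):
    ax.annotate(txt, (x2[i] * 1.05, x1[i] * 1.05))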

In [293]:
import seaborn as sns
from scipy import stats

#tips = sns.load_dataset('df')
#ax = plt.subplots(figsize=(7, 7))
#ax.set(xscale="log", yscale="log")

f, ax = plt.subplots(figsize=(7, 7))
ax.set(xscale="log", yscale="log")
for i, txt in enumerate( x3 ):
    ax.annotate( txt, ( x2[i] + 0.1, x1[i] + 0.1 ) )
sns.regplot("Commits", "Contributors", df, ax=ax, scatter_kws={"s": 50}, robust=True)
Out[293]:
<matplotlib.axes._subplots.AxesSubplot at 0xf2c4f98>
In [266]:
# Joint plot: regression fit plus marginal distributions and the Pearson correlation.
g = sns.JointGrid("Contributors", "Commits", data=df)
g.plot(sns.regplot, sns.distplot, stats.pearsonr);
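The same figure can be produced with seaborn's higher-level jointplot wrapper; with kind="reg" it draws the regression fit and the marginal distributions in one call (a sketch using the column names of the DataFrame above):

sns.jointplot("Contributors", "Commits", data=df, kind="reg")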
In [283]:
from scipy import stats
import numpy as np

# Linear regression of contributors (x2) on commits (x1); report R^2 and the p-value.
slope, intercept, r_value, p_value, std_err = stats.linregress(x1, x2)
print "r-squared:", r_value**2, p_value
r-squared: 0.761705948236 2.30055656173e-06
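Since both scatter plots use log scales, it may be more informative to fit the regression in log-log space; the slope then estimates the exponent of a power-law relationship between commits and contributors (a sketch on the lists gathered above, output not shown):

import numpy as np
from scipy import stats

# Fit log10(contributors) against log10(commits); the slope is the power-law exponent.
log_commits = np.log10(x1)
log_contributors = np.log10(x2)
slope, intercept, r_value, p_value, std_err = stats.linregress(log_commits, log_contributors)
print "slope:", slope, "r-squared:", r_value**2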