In [168]:
import urllib2

# Fetch the blog post that lists machine-learning modules and collect all GitHub links.
search_address = "http://lernpython.de/21-machine-learning-pythonmodule-auf-github"
response = urllib2.urlopen(search_address)
html = response.read()

# The links are attribute values, so splitting on double quotes isolates them.
html2 = html.split('"')

htmllist2 = []
for i in html2:
    if i.startswith("https://github.com"):
        htmllist2.append(i)

for i in htmllist2:
    print i
https://github.com/scikit-learn/scikit-learn
https://github.com/numenta/nupic
https://github.com/machinalis/iepy
https://github.com/machinalis/quepy
https://github.com/machinalis/featureforge
https://github.com/jaberg/skdata
https://github.com/rasbt/mlxtend
https://github.com/awslabs/machine-learning-samples
https://github.com/yandex/rep
https://github.com/dclambert/Python-ELM
https://github.com/mila-udem/fuel
https://github.com/nilearn/nilearn
https://github.com/idiap/bob
https://github.com/pybrain/pybrain
https://github.com/clips/pattern
https://github.com/lisa-lab/pylearn2
https://github.com/luispedro/milk
https://github.com/numenta/nupic
https://github.com/kvh/ramp
https://github.com/dnouri/nolearn
https://github.com/Theano/Theano
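Splitting on double quotes works here, but it silently depends on every link being a quoted attribute value. As a sketch of a slightly more robust alternative (assuming the page still embeds plain https://github.com/<owner>/<repo> links), a regular expression can pull the repository URLs directly and drop duplicates while keeping their order:

import re
import urllib2

# Sketch: match owner/repo URLs directly instead of relying on quoted attribute values.
html = urllib2.urlopen("http://lernpython.de/21-machine-learning-pythonmodule-auf-github").read()
links = re.findall(r'https://github\.com/[\w.\-]+/[\w.\-]+', html)

# Keep the first occurrence of each repository, preserving the original order.
seen = set()
repos = []
for link in links:
    if link not in seen:
        seen.add(link)
        repos.append(link)

for link in repos:
    print link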

In [170]:
x1 = []  # commits per repository
x2 = []  # contributors per repository
x3 = []  # repository names
for i in htmllist2:
    search_address = i
    response = urllib2.urlopen(search_address)
    name = i.split('/')[-1]

    html = response.read()
    # Number of commits: cut out the text between the GitHub page markers,
    # then read the number inside the emphasized <span>.
    com = html.split('      <li class="commits">')[-1].split('            commits')[0]
    com = com.split('<span class="num text-emphasized">')[-1].split('</span>')[0]
    com = com.strip()
    try:
        com = int(com.replace(',', ''))
    except ValueError:
        pass

    # Number of contributors: same approach with the neighbouring markers.
    con = html.split('         releases')[-1].split('    contributors')[0]
    con = con.split('<span class="num text-emphasized">')[-1].split('</span>')[0]
    con = con.strip()
    try:
        con = int(con.replace(',', ''))
        x2.append(con)
        x3.append(name)
        x1.append(com)
    except ValueError:
        pass

print x1, x2, x3
[18911, 4431, 1760, 131, 219, 441, 147, 15, 50, 17, 5080, 969, 943, 7042, 4431, 179, 205, 18576] [409, 61, 10, 9, 3, 10, 5, 3, 3, 1, 11, 27, 20, 117, 61, 4, 4, 141] ['scikit-learn', 'nupic', 'iepy', 'quepy', 'featureforge', 'skdata', 'mlxtend', 'machine-learning-samples', 'rep', 'Python-ELM', 'bob', 'pybrain', 'pattern', 'pylearn2', 'nupic', 'ramp', 'nolearn', 'Theano']
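The commit and contributor counts are cut out of GitHub's HTML with hard-coded markers, and the same split/strip/int sequence appears twice. A small helper makes the brittleness explicit and easier to adjust if GitHub changes its markup (a sketch built around the same markers used above):

def extract_count(html, start_marker, end_marker):
    # Cut the fragment between the two markers, then pull the number out of the
    # '<span class="num text-emphasized">' element, exactly as in the loop above.
    fragment = html.split(start_marker)[-1].split(end_marker)[0]
    value = fragment.split('<span class="num text-emphasized">')[-1].split('</span>')[0]
    try:
        return int(value.strip().replace(',', ''))
    except ValueError:
        return None  # marker not found or GitHub changed its page layout

# Usage, equivalent to the extraction in the loop above:
# com = extract_count(html, '      <li class="commits">', '            commits')
# con = extract_count(html, '         releases', '    contributors')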

In [295]:
import pandas as pd

print len(x3), len(x2), len(x1)

# Commits per contributor for each module (used later as marker size).
size2 = []
for i in range(len(x2)):
    size2.append(float(x1[i]) / x2[i])

d = {'Commits' : x1,
     'Contributors' : x2,
     'Module name' : x3,
     'Ratio Commits/Contributors' : size2}

df = pd.DataFrame(d)
%matplotlib inline
df.plot(logy=True, logx=True, kind='scatter', y='Commits', x='Contributors');
df
18 18 18

Out[295]:
Commits Contributors Module name Ratio Commits/Contributors
0 18911 409 scikit-learn 46.237164
1 4431 61 nupic 72.639344
2 1760 10 iepy 176.000000
3 131 9 quepy 14.555556
4 219 3 featureforge 73.000000
5 441 10 skdata 44.100000
6 147 5 mlxtend 29.400000
7 15 3 machine-learning-samples 5.000000
8 50 3 rep 16.666667
9 17 1 Python-ELM 17.000000
10 5080 11 bob 461.818182
11 969 27 pybrain 35.888889
12 943 20 pattern 47.150000
13 7042 117 pylearn2 60.188034
14 4431 61 nupic 72.639344
15 179 4 ramp 44.750000
16 205 4 nolearn 51.250000
17 18576 141 Theano 131.744681
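nupic was linked twice in the blog post, so it also appears twice in the table (rows 1 and 14). If the duplicate should be dropped before plotting, one short way to do it on the DataFrame built above (a sketch, not applied to the results shown here):

# Keep only the first row for each module name.
df_unique = df[~df['Module name'].duplicated()]
df_unique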
In [294]:
from __future__ import division
import matplotlib.pyplot as plt
print x1, x2, size2, x3

# Log-log scatter plot: contributors on x, commits on y,
# marker size proportional to commits per contributor.
fig, ax = plt.subplots()
ax.set_yscale('log')
ax.set_xscale('log')
ax.scatter(x2, x1, s=size2)
ax.set_xlabel("Contributors", fontsize=12)
ax.set_ylabel("Commits", fontsize=12)

# Label each point with the module name.
for i, txt in enumerate(x3):
    ax.annotate(txt, (x2[i] + 0.1, x1[i] + 0.1))

plt.grid()
plt.show()
[18911, 4431, 1760, 131, 219, 441, 147, 15, 50, 17, 5080, 969, 943, 7042, 4431, 179, 205, 18576] [409, 61, 10, 9, 3, 10, 5, 3, 3, 1, 11, 27, 20, 117, 61, 4, 4, 141] [46.23716381418093, 72.63934426229508, 176.0, 14.555555555555555, 73.0, 44.1, 29.4, 5.0, 16.666666666666668, 17.0, 461.8181818181818, 35.888888888888886, 47.15, 60.18803418803419, 72.63934426229508, 44.75, 51.25, 131.74468085106383] ['scikit-learn', 'nupic', 'iepy', 'quepy', 'featureforge', 'skdata', 'mlxtend', 'machine-learning-samples', 'rep', 'Python-ELM', 'bob', 'pybrain', 'pattern', 'pylearn2', 'nupic', 'ramp', 'nolearn', 'Theano']
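On logarithmic axes an additive offset of 0.1 is practically invisible next to values in the hundreds or thousands, so the labels sit right on top of the points. A multiplicative offset keeps the shift proportional to each value (a sketch replacing the annotation loop above; the factor 1.05 is an arbitrary choice):

# Shift each label by ~5% of its coordinates, which reads as a constant offset on log axes.
for i, txt in enumerate(x3):
    ax.annotate(txt, (x2[i] * 1.05, x1[i] * 1.05))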

In [293]:
import seaborn as sns
from scipy import stats

#tips = sns.load_dataset('df')
#ax = plt.subplots(figsize=(7, 7))
#ax.set(xscale="log", yscale="log")

f, ax = plt.subplots(figsize=(7, 7))
ax.set(xscale="log", yscale="log")
for i, txt in enumerate( x3 ):
    ax.annotate( txt, ( x2[i] + 0.1, x1[i] + 0.1 ) )
sns.regplot("Commits", "Contributors", df, ax=ax, scatter_kws={"s": 50}, robust=True)
Out[293]:
<matplotlib.axes._subplots.AxesSubplot at 0xf2c4f98>
In [266]:
# Joint plot: regression fit plus marginal distributions and the Pearson correlation.
g = sns.JointGrid("Contributors", "Commits", data=df)
g.plot(sns.regplot, sns.distplot, stats.pearsonr);
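The same figure can be produced with seaborn's higher-level jointplot wrapper; with kind="reg" it draws the regression fit and the marginal distributions in one call (a sketch using the column names of the DataFrame above):

sns.jointplot("Contributors", "Commits", data=df, kind="reg")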
In [283]:
from scipy import stats
import numpy as np

# Linear regression of contributors (x2) on commits (x1); report R^2 and the p-value.
slope, intercept, r_value, p_value, std_err = stats.linregress(x1, x2)
print "r-squared:", r_value**2, p_value
r-squared: 0.761705948236 2.30055656173e-06
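Since both scatter plots use log scales, it may be more informative to fit the regression in log-log space; the slope then estimates the exponent of a power-law relationship between commits and contributors (a sketch on the lists gathered above, output not shown):

import numpy as np
from scipy import stats

# Fit log10(contributors) against log10(commits); the slope is the power-law exponent.
log_commits = np.log10(x1)
log_contributors = np.log10(x2)
slope, intercept, r_value, p_value, std_err = stats.linregress(log_commits, log_contributors)
print "slope:", slope, "r-squared:", r_value**2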