import urllib2
#import re
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
# GitHub repository pages whose commit and contributor counts are collected
# by iterating over htmllist2.
link0 = "https://github.com/scikit-learn/scikit-learn"
link1 = "https://github.com/numenta/nupic"
link2 = "https://github.com/machinalis/iepy"
link3 = "https://github.com/machinalis/quepy"
link4 = "https://github.com/machinalis/featureforge"
link5 = "https://github.com/jaberg/skdata"
link6 = "https://github.com/rasbt/mlxtend"
link7 = "https://github.com/awslabs/machine-learning-samples"
link8 = "https://github.com/yandex/rep"
link9 = "https://github.com/dclambert/Python-ELM"
link10 = "https://github.com/mila-udem/fuel"
link11 = "https://github.com/nilearn/nilearn"
link12 = "https://github.com/idiap/bob"
link13 = "https://github.com/pybrain/pybrain"
link14 = "https://github.com/clips/pattern"
link15 = "https://github.com/lisa-lab/pylearn2"
link16 = "https://github.com/luispedro/milk"
# NOTE(review): link17 is the same URL as link1 (numenta/nupic), so that
# repository is still visited twice -- confirm which repository was intended.
link17 = "https://github.com/numenta/nupic"
link18 = "https://github.com/kvh/ramp"
link19 = "https://github.com/dnouri/nolearn"

# Each link variable appears exactly once, in declaration order, so no page
# is requested twice merely because its variable was repeated in the list.
htmllist2 = [
    link0, link1, link2, link3, link4, link5, link6, link7, link8, link9,
    link10, link11, link12, link13, link14, link15, link16, link17, link18,
    link19,
]
# Set of English stop words. Not used by the scraping loop in this block.
s = set(stopwords.words('english'))
# n and doclist are not used in the visible code; they are kept because later
# parts of the file (not visible here) may rely on them.
n = 0
doclist = []

x1 = []  # commit count of each repository, in htmllist2 order
x2 = []  # contributor count of each repository (0 when it cannot be parsed)

# Opening tag of the element that wraps each number on the repository page.
_COUNT_SPAN = '<span class="num text-emphasized">'


def _count_text(section):
    """Return the stripped text of the last count <span> found in *section*.

    If *section* contains no such span, the whole section (stripped, and cut
    at the first '</span>') is returned, so callers must be prepared for
    int() to fail on the result.
    """
    return section.split(_COUNT_SPAN)[-1].split('</span>')[0].strip()


for search_address in htmllist2:
    response = urllib2.urlopen(search_address)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    # Commit count: the number that sits between the "commits" list item
    # and the following ' commits' label.
    com = html.split(' <li class="commits">')[-1].split(' commits')[0]
    # A page without a parsable commit count is not tolerated: int() raises
    # ValueError and aborts the run.
    com = int(_count_text(com).replace(',', ''))

    # Contributor count: the number between the ' releases' label and the
    # ' contributors' label.
    con = html.split(' releases')[-1].split(' contributors')[0]
    try:
        con = int(_count_text(con).replace(',', ''))
    except ValueError:
        # No parsable number in that part of the page; record 0 so that x1
        # and x2 stay aligned index-by-index.
        con = 0

    x1.append(com)
    x2.append(con)

# NOTE(review): the original indentation was lost, so it is unclear whether
# this print belonged inside the loop; it is assumed to report the final lists.
# Written as a single formatted string so it emits exactly what the Python 2
# statement `print x1 ,x2` would, while also parsing under Python 3.
print("%s %s" % (x1, x2))
# Plotting cell: scatter of commit counts against contributor counts on
# log-log axes.
# IPython magic, not Python syntax -- this line is only valid when the code is
# run under IPython/Jupyter.
%matplotlib inline
# NOTE(review): np is not used in the visible code.
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): in a plain Python module a __future__ import must be the first
# statement of the file, so this line is a SyntaxError outside a notebook;
# presumably it was accepted because it ran as part of an IPython cell --
# confirm. No division is performed in the visible code.
from __future__ import division
# x axis: x2 (contributor counts), y axis: x1 (commit counts); 'ro' selects
# red circle markers.
# NOTE(review): x2 holds 0 for pages whose contributor count could not be
# parsed; such points presumably cannot be placed on a log axis -- verify.
plt.loglog(x2,x1, 'ro', markersize=15)