#!/usr/bin/python
import feedparser
import re
#
# Count words in rss feed
#
def getwordcounts(url):
    d = feedparser.parse(url)
    wc = {}
    for e in d.entries:
        if 'summary' in e: summary = e.summary
        else: summary = e.description
        words = getwords(e.title + ' ' + summary)
        for word in words:
            wc.setdefault(word, 0)
            wc[word] += 1
    return d.feed.title, wc
#
# Strip out HTML tags and split the text into lowercase words
#
def getwords(html):
    # Remove all HTML tags
    txt = re.compile(r'<[^>]+>').sub('', html)
    # Split on anything that is not a letter
    words = re.compile(r'[^A-Za-z]+').split(txt)
    return [word.lower() for word in words if word != '']
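# Illustrative example (an addition, not part of the original script): tags are
# stripped, the text is split on runs of non-letters, and everything is lowercased:
#     getwords('<p>Hello <b>World</b>, hello RSS!</p>')
#     -> ['hello', 'world', 'hello', 'rss']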
if __name__ == '__main__':
    apcount = {}
    wordcounts = {}
    feedcount = 0
    feedlist = [line.strip() for line in file('feedlist.txt')]
    for feedurl in feedlist:
        try:
            title, wc = getwordcounts(feedurl)
            wordcounts[title] = wc
            # apcount records, for each word, how many blogs use it more than once
            for word, count in wc.items():
                apcount.setdefault(word, 0)
                if count > 1:
                    apcount[word] += 1
        except:
            print 'Failed to parse feed %s' % feedurl
        feedcount += 1
        print 'finished %d/%d' % (feedcount, len(feedlist))
    wordlist = []
    for w, bc in apcount.items():
        frac = float(bc) / len(feedlist)
        # Keep only words that appear in 10%-50% of the blogs
        if frac > 0.1 and frac < 0.5: wordlist.append(w)
    out = file('blogdata.txt', 'w')
    out.write('Blog')
    for word in wordlist: out.write('\t%s' % word)
    out.write('\n')
    for blog, wc in wordcounts.items():
        out.write(blog)
        for word in wordlist:
            if word in wc: out.write('\t%d' % wc[word])
            else: out.write('\t0')
        out.write('\n')
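# A rough sketch of the blogdata.txt layout this script writes (the actual blog
# names, words and counts depend on the feeds listed in feedlist.txt):
#
#     Blog        word1   word2   word3
#     Some Blog   3       0       1
#     Other Blog  0       2       0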
#!/usr/bin/python
from math import sqrt
#
# Pearson distance between p1 and p2: 1.0 minus the Pearson correlation,
# so smaller values mean the two rows are more similar
#
def pearson(p1, p2):
    # Simple sums and sums of the squares
    sum1 = sum(p1)
    sum2 = sum(p2)
    sum1Sq = sum([pow(p, 2) for p in p1])
    sum2Sq = sum([pow(p, 2) for p in p2])
    # Sum of the products
    pSum = sum([p1[i]*p2[i] for i in range(len(p1))])
    # Calculate the Pearson score
    num = pSum - (sum1*sum2/len(p1))
    den = sqrt((sum1Sq - pow(sum1, 2)/len(p1)) * (sum2Sq - pow(sum2, 2)/len(p1)))
    if den == 0: return 0
    # Return as a distance: smaller means more similar
    return 1.0 - num/den
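# Quick check of the distance behaviour (an illustration, not in the original):
#     pearson([1, 2, 3], [2, 4, 6])  ->  0.0   (perfectly correlated)
#     pearson([1, 2, 3], [3, 2, 1])  ->  2.0   (perfectly anti-correlated)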
#
# Read Dataset
#
def readfile(filename):
    lines = [line for line in file(filename)]
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        rownames.append(p[0])
        data.append([float(x) for x in p[1:]])
    return rownames, colnames, data
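# Example of what readfile() returns for the blogdata.txt written by the previous
# script (placeholder values, not real output):
#     rownames = ['Some Blog', 'Other Blog']    # blog titles
#     colnames = ['word1', 'word2']             # the selected words
#     data     = [[3.0, 0.0], [0.0, 2.0]]       # one row of float counts per blog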
#
# Cluster class
#
class bicluster:
    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        self.left = left
        self.right = right
        self.vec = vec
        self.id = id
        self.distance = distance
#
# Hierarchical Clustering
#
def hcluster(rows, distance=pearson):
    distances = {}
    # Clusters created by merging get negative ids
    currentclustid = -1
    # Clusters start out as just the original rows
    clust = [bicluster(rows[i], id=i) for i in range(len(rows))]
    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)
        # Loop through every pair looking for the smallest distance
        for i in range(len(clust)):
            for j in range(i+1, len(clust)):
                # distances is a cache of distance calculations, keyed by cluster ids
                if (clust[i].id, clust[j].id) not in distances:
                    distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
                d = distances[(clust[i].id, clust[j].id)]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)
        # The new cluster vector is the average of the two merged clusters
        mergevec = [(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))]
        # Create the new cluster
        newcluster = bicluster(mergevec, left=clust[lowestpair[0]], right=clust[lowestpair[1]], distance=closest, id=currentclustid)
        currentclustid -= 1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)
    return clust[0]
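#
# Illustrative helper (an addition, not part of the original code): collect the
# row ids of the leaves under a cluster. Leaves keep their original id (>= 0),
# merged clusters get negative ids, so getleaves(hcluster(data)) lists every row once.
#
def getleaves(clust):
    if clust.id >= 0: return [clust.id]
    return getleaves(clust.left) + getleaves(clust.right)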
#
# Print result after hierarchical clustering
#
def printclust(clust, labels=None, n=0):
    # Indent to make a hierarchy layout
    for i in range(n): print ' ',
    if clust.id < 0:
        # Negative id means this is a branch (a merged cluster)
        print '-'
    else:
        # Non-negative id means this is an endpoint
        if labels == None: print clust.id
        else: print labels[clust.id]
    # Now print the left and right branches
    if clust.left != None: printclust(clust.left, labels=labels, n=n+1)
    if clust.right != None: printclust(clust.right, labels=labels, n=n+1)
if __name__ == '__main__':
    blognames, words, data = readfile('blogdata.txt')
    clust = hcluster(data)
    printclust(clust, labels=blognames)
#!/usr/bin/python
import random
from math import sqrt
#
# Read Dataset
#
def readfile(filename):
    lines = [line for line in file(filename)]
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        rownames.append(p[0])
        data.append([float(x) for x in p[1:]])
    return rownames, colnames, data
#
# Pearson distance between p1 and p2: 1.0 minus the Pearson correlation,
# so smaller values mean the two rows are more similar
#
def pearson(p1, p2):
    # Simple sums, sums of the squares, and sum of the products
    sum1 = sum(p1)
    sum2 = sum(p2)
    sum1Sq = sum([pow(p, 2) for p in p1])
    sum2Sq = sum([pow(p, 2) for p in p2])
    pSum = sum([p1[i]*p2[i] for i in range(len(p1))])
    # Calculate the Pearson score and return it as a distance
    num = pSum - (sum1*sum2/len(p1))
    den = sqrt((sum1Sq - pow(sum1, 2)/len(p1)) * (sum2Sq - pow(sum2, 2)/len(p1)))
    if den == 0: return 0
    return 1.0 - num/den
def kcluster(rows, distance=pearson, k=4):
    # Determine the minimum and maximum value for each column
    ranges = [(min([row[i] for row in rows]), max([row[i] for row in rows])) for i in range(len(rows[0]))]
    # Create k randomly placed centroids
    clusters = [[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(rows[0]))] for j in range(k)]
    lastmatches = None
    for t in range(100):
        print 'Iteration %d' % t
        bestmatches = [[] for i in range(k)]
        # Find which centroid is closest to each row
        for j in range(len(rows)):
            row = rows[j]
            bestmatch = 0
            for i in range(k):
                d = distance(clusters[i], row)
                if d < distance(clusters[bestmatch], row):
                    bestmatch = i
            bestmatches[bestmatch].append(j)
        # If the assignments are the same as last time, the clustering is done
        if bestmatches == lastmatches: break
        lastmatches = bestmatches
        # Move each centroid to the average of its members
        for i in range(k):
            avgs = [0.0] * len(rows[0])
            if len(bestmatches[i]) > 0:
                for rowid in bestmatches[i]:
                    for m in range(len(rows[rowid])):
                        avgs[m] += rows[rowid][m]
                for j in range(len(avgs)):
                    avgs[j] /= len(bestmatches[i])
                clusters[i] = avgs
    return bestmatches
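# Usage note (an addition): the centroids start at random positions, so the
# groupings can differ from run to run; seeding the generator before calling
# kcluster, e.g. random.seed(42), makes a run reproducible.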
if __name__ == '__main__':
    blognames, words, data = readfile('blogdata.txt')
    kclust = kcluster(data, k=10)
    print kclust
    print [blognames[r] for r in kclust[0]]
Multidimensional Scaling (MDS)
#!/usr/bin/python
import random
from math import sqrt
from PIL import Image, ImageDraw
# pearson() is the same distance function defined in the clustering script above;
# this import assumes that script is saved as clusters.py
from clusters import pearson
#
# Read Dataset
#
def readfile(filename):
    lines = [line for line in file(filename)]
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        rownames.append(p[0])
        data.append([float(x) for x in p[1:]])
    return rownames, colnames, data
def scaledown(data, distance=pearson, rate=0.01):
    n = len(data)
    # The real distances between every pair of items
    realdist = [[distance(data[i], data[j]) for j in range(n)] for i in range(0, n)]
    # Randomly initialize the starting 2D locations of the points
    loc = [[random.random(), random.random()] for i in range(n)]
    fakedist = [[0.0 for j in range(n)] for i in range(n)]
    lasterror = None
    for m in range(0, 1000):
        # Find the projected distances in 2D
        for i in range(n):
            for j in range(n):
                fakedist[i][j] = sqrt(sum([pow(loc[i][x]-loc[j][x], 2) for x in range(len(loc[i]))]))
        # Move points
        grad = [[0.0, 0.0] for i in range(n)]
        totalerror = 0
        for k in range(n):
            for j in range(n):
                if j == k: continue
                # The error is the percent difference between the distances
                errorterm = (fakedist[j][k]-realdist[j][k])/realdist[j][k]
                # Each point is moved away from or towards the other point
                # in proportion to how much error it has
                grad[k][0] += ((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm
                grad[k][1] += ((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm
                # Keep track of the total error
                totalerror += abs(errorterm)
        print totalerror
        # If the answer got worse by moving the points, we are done
        if lasterror and lasterror < totalerror: break
        lasterror = totalerror
        # Move each point by the learning rate times the gradient
        for k in range(n):
            loc[k][0] -= rate*grad[k][0]
            loc[k][1] -= rate*grad[k][1]
    return loc
def draw2d(data, labels, png='mds2d.png'):
    img = Image.new('RGB', (2000, 2000), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    for i in range(len(data)):
        x = (data[i][0]+0.5)*1000
        y = (data[i][1]+0.5)*1000
        draw.text((x, y), labels[i], (0, 0, 0))
    img.save(png, 'PNG')
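# Note (an addition): draw2d relies on the PIL/Pillow library imported above;
# since no font is passed to draw.text(), PIL falls back to its default bitmap font.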
if __name__ == '__main__':
    blognames, words, data = readfile('blogdata.txt')
    coords = scaledown(data)
    draw2d(coords, blognames, png='blogs2d.png')