# Walk every feed URL in feedlist.txt, build per-blog word counts, and track
# apcount: the number of blogs each word appears in (with count > 1).
# NOTE(review): relies on getwordcounts(feedurl) defined elsewhere in the file.
apcount = {}
wordcounts = {}
with open('feedlist.txt') as feedfile:
    feedlist = [line for line in feedfile]
for feedurl in feedlist:
    title, wc = getwordcounts(feedurl)
    wordcounts[title] = wc
    for word, count in wc.items():
        apcount.setdefault(word, 0)
        if count > 1:
            apcount[word] += 1

# Keep only words whose blog frequency falls in (10%, 50%) — common enough to
# matter, rare enough to be distinctive.  (Original had the syntax error
# "0.1and"; fixed with a chained comparison.)
wordlist = []
for w, bc in apcount.items():
    frac = float(bc) / len(feedlist)
    if 0.1 < frac < 0.5:
        wordlist.append(w)

# Emit the blog/word matrix: one header row of words, then one tab-separated
# row of counts per blog (0 when the blog never used the word).
with open('blogdata.txt', 'w') as out:
    out.write('Blog')
    for word in wordlist:
        out.write('\t%s' % word)
    out.write('\n')
    for blog, wc in wordcounts.items():
        out.write(blog)
        for word in wordlist:
            if word in wc:
                out.write('\t%s' % wc[word])
            else:
                out.write('\t0')
        out.write('\n')
def readfile(filename):
    """Parse a tab-separated matrix file written by the feed script.

    The first line is 'Blog' followed by column (word) names; each later
    line is a row name followed by float-parseable values.

    Returns a (rownames, colnames, data) tuple where data is a list of
    float lists, one per row.
    """
    # with-block fixes the original's leaked file handle
    with open(filename) as f:
        lines = [line for line in f]
    # first row holds the column titles; drop the leading 'Blog' cell
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        # first column of each row is the row name
        rownames.append(p[0])
        # the remaining cells are that row's numeric data
        data.append([float(x) for x in p[1:]])
    return rownames, colnames, data
# NOTE(review): the function header, the distances/currentclustid setup, and
# the bicluster-creation line were lost in the scrape; reconstructed here.
# Confirm the signature against the canonical source.
def hcluster(rows, distance=pearson):
    """Agglomerative hierarchical clustering over the rows of a matrix.

    Repeatedly merges the two closest clusters (averaging their vectors)
    until one root bicluster remains, and returns it.  Merged clusters get
    negative ids to mark that they were not in the original data set.
    """
    distances = {}          # cache: (id_a, id_b) -> distance
    currentclustid = -1     # next id for a merged (non-original) cluster

    # the initial clusters are simply the rows of the data set
    clust = [bicluster(rows[i], id=i) for i in range(len(rows))]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        # scan every pair for the smallest distance, caching computed values
        # (original had the syntax error "notin")
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                if (clust[i].id, clust[j].id) not in distances:
                    distances[(clust[i].id, clust[j].id)] = distance(
                        clust[i].vec, clust[j].vec)
                d = distances[(clust[i].id, clust[j].id)]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)

        # the merged cluster's vector is the element-wise average of the pair
        mergevec = [
            (clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0
            for i in range(len(clust[0].vec))
        ]

        newcluster = bicluster(mergevec,
                               left=clust[lowestpair[0]],
                               right=clust[lowestpair[1]],
                               distance=closest,
                               id=currentclustid)

        currentclustid -= 1
        # delete the HIGHER index first — the original deleted lowestpair[0]
        # first, which shifted the list and removed the wrong second element
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]
最简单的可视化:递归遍历聚类树,以类似文件系统目录结构的形式将其打印出来。
1 2 3 4 5 6 7 8 9 10 11 12 13 14
def printclust(clust, labels=None, n=0):
    """Print a cluster tree like a filesystem listing.

    Each node is indented by its depth n; internal (merged) nodes — which
    carry negative ids — print '-', leaves print labels[id] (or the id
    itself when labels is None), and both children are visited recursively.
    """
    # indent proportional to depth; end='' keeps the spaces on the node's
    # own line (the original's print(' ') emitted a newline per space)
    for i in range(n):
        print(' ', end='')
    if clust.id < 0:
        # negative id marks a branch created by a merge
        print('-')
    else:
        # non-negative id marks a leaf from the original data
        if labels is None:
            print(clust.id)
        else:
            print(labels[clust.id])
    if clust.left is not None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right is not None:
        printclust(clust.right, labels=labels, n=n + 1)
def rotatematrix(data):
    """Transpose a row-major matrix (list of equal-length lists).

    Returns a new matrix whose [i][j] element is data[j][i]; the input is
    left untouched.
    """
    # zip(*data) yields the columns as tuples; convert each back to a list
    return [list(col) for col in zip(*data)]
def kcluster(rows, distance=pearson, k=4):
    """K-means clustering over the rows of a matrix.

    Starts from k random centroids drawn inside each column's value range,
    then alternates assignment and centroid-averaging for up to 100
    iterations, stopping early once the assignment stops changing.

    Returns bestmatches: a list of k lists of row indices.
    NOTE(review): results depend on random state — seed externally for
    reproducibility.
    """
    # per-column (min, max) so random centroids land inside the data's range
    ranges = [(min([row[i] for row in rows]), max([row[i] for row in rows]))
              for i in range(len(rows[0]))]

    # create k randomly placed centroids
    clusters = [
        [random.random() * (ranges[i][1] - ranges[i][0]) + ranges[i][0]
         for i in range(len(rows[0]))]
        for j in range(k)
    ]

    lastmatches = None
    for t in range(100):
        print('Iteration %d' % t)
        bestmatches = [[] for i in range(k)]

        # assign each row to its nearest centroid; track the best distance
        # instead of recomputing distance(clusters[bestmatch], row) per
        # candidate as the original did
        for j in range(len(rows)):
            row = rows[j]
            bestmatch = 0
            bestdist = distance(clusters[0], row)
            for i in range(1, k):
                d = distance(clusters[i], row)
                if d < bestdist:
                    bestdist = d
                    bestmatch = i
            bestmatches[bestmatch].append(j)

        # converged: same assignment as the previous iteration
        if bestmatches == lastmatches:
            break
        lastmatches = bestmatches

        # move each centroid to the mean of its members (empty clusters
        # keep their previous position)
        for i in range(k):
            avgs = [0.0] * len(rows[0])
            if len(bestmatches[i]) > 0:
                for rowid in bestmatches[i]:
                    for m in range(len(rows[rowid])):
                        avgs[m] += rows[rowid][m]
                for j in range(len(avgs)):
                    avgs[j] /= len(bestmatches[i])
                clusters[i] = avgs

    return bestmatches
1 2
# Run k-means with 10 clusters, then list the blog titles in the first one.
kclust = clusters.kcluster(data, k=10)
[blognames[r] for r in kclust[0]]
输出
['Unknown title', 'NB Blog Feed', 'ThinkProgress', 'TechEBlog', 'TMZ.com']
# NOTE(review): the function header, n = len(data), the gradient-update
# lines, and the totalerror accumulation were lost in the scrape;
# reconstructed here — confirm against the canonical source.  The original
# also initialised an unused outersum=0.0, dropped here.
def scaledown(data, distance=pearson, rate=0.01):
    """Multidimensional scaling: project rows of data into 2-D.

    Places the items at random 2-D positions, then gradient-descends so
    that the pairwise 2-D distances approximate the real distances, for up
    to 1000 iterations or until the total error stops improving.

    Returns loc: a list of [x, y] coordinates, one per row of data.
    """
    n = len(data)

    # real distance between every pair of items
    realdist = [[distance(data[i], data[j]) for j in range(n)]
                for i in range(0, n)]

    # random initial positions in 2-D
    loc = [[random.random(), random.random()] for i in range(n)]
    fakedist = [[0.0 for j in range(n)] for i in range(n)]

    lasterror = None
    for m in range(0, 1000):
        # current Euclidean distances in the projection
        for i in range(n):
            for j in range(n):
                fakedist[i][j] = sqrt(sum(
                    [pow(loc[i][x] - loc[j][x], 2)
                     for x in range(len(loc[i]))]))

        # accumulate a gradient per node
        grad = [[0.0, 0.0] for i in range(n)]

        totalerror = 0
        for k in range(n):
            for j in range(n):
                if j == k:
                    continue
                # error is the fractional difference between projected and
                # real distance
                errorterm = (fakedist[j][k] - realdist[j][k]) / realdist[j][k]
                # push node k away from / toward j in proportion to the error
                grad[k][0] += ((loc[k][0] - loc[j][0]) / fakedist[j][k]) * errorterm
                grad[k][1] += ((loc[k][1] - loc[j][1]) / fakedist[j][k]) * errorterm
                totalerror += abs(errorterm)

        # stop when moving the nodes made things worse
        if lasterror and lasterror < totalerror:
            break
        lasterror = totalerror

        # move every node by rate * gradient
        for k in range(n):
            loc[k][0] -= rate * grad[k][0]
            loc[k][1] -= rate * grad[k][1]

    return loc
可视化-生成二维图
1 2 3 4 5 6 7 8
def draw2d(data, labels, jpeg='mds2d.jpg'):
    """Render 2-D coordinates as labelled text on a 2000x2000 white image.

    data is a list of [x, y] pairs (scaledown output, roughly centred on
    0); each point is shifted by 0.5 and scaled by 1000 to map onto the
    canvas, labelled in black, and the result is saved as a JPEG.
    Requires PIL/Pillow (Image, ImageDraw) imported at module level.
    """
    img = Image.new('RGB', (2000, 2000), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    for i in range(len(data)):
        x = (data[i][0] + 0.5) * 1000
        y = (data[i][1] + 0.5) * 1000
        draw.text((x, y), labels[i], (0, 0, 0))
    img.save(jpeg, 'JPEG')
1 2 3 4 5
# imp is deprecated and removed in Python 3.12 — reload lives in importlib.
from importlib import reload
reload(clusters)
blognames, words, data = clusters.readfile('blogdata.txt')
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='blogs2d.jpg')
The URL you are trying to parse with feedparser is not a valid feed but a web page (check it with feedvalidator), or the feed is empty, or its title is empty.