数学万
第一步是数据抓取(py2.7完成的):
import urllib2
import re
import os
# Step 1 (Python 2.7): download the singer page and collect the lyric page
# ids that appear in links of the form 'geci/<id>.htm'.
body = urllib2.urlopen("http://www.9ku.com/geshou/722.htm")
try:
    # Previously bound to the name `str`, which shadowed the builtin.
    page_html = body.read()
finally:
    body.close()

# Hard-coded markers delimiting the song list on the page.
pos = page_html.find('826828@')  # position of the first occurrence
pos1 = page_html.find("485.")    # position of the last song
# NOTE(review): find() returns -1 when a marker is missing, in which case
# this slice silently selects the wrong region -- confirm the markers still
# exist on the live page.
vals = page_html[pos:pos1]
arr = vals.split('geci/')

# Keep each fragment up to (not including) ".htm".
# NOTE(review): arr[0] is the text that precedes the first 'geci/' and is
# kept as well, exactly as before -- verify it is a usable id.
data_arr = [data[0:data.find(".htm")] for data in arr]

# Base URL of a lyric page; an entry of data_arr plus '.htm' is appended.
url = "http://www.9ku.com/geci/"
def readMsg(data):
    """Clean a piece of lyric text.

    HTML tags are removed first, then every character listed in the
    character class below (ASCII letters, digits, CR/LF and assorted
    punctuation), then spaces and colons.

    Args:
        data: the raw text to clean.

    Returns:
        The cleaned text.
    """
    # The published listing showed this pattern as r']+>', which is what is
    # left of the usual tag-stripping regex after "<[^>" was itself removed
    # as if it were markup; restored here.
    dr = re.compile(r'<[^>]+>', re.S)
    clsStr = dr.sub('', data)
    clsStr = re.sub("[A-Za-z0-9\[\`\u3000\~\:\!\@\#\$\^\&\*\(\)\=\|\{\}\'\r\n\:\;\'\,\[\]\.\\/\?\~\!\@\#\\\&\*\%]", '', clsStr)
    clsStr = clsStr.replace(' ', '')
    clsStr = clsStr.replace(':', '')  # more words to filter out can be added here
    # The old trailing f.close() was removed: this function opens no file,
    # so it either raised NameError or closed a file owned by the caller.
    return clsStr
def substrSN(strs, str1, str2):
    """Return the part of ``strs`` from ``str1`` up to ``str2``.

    The result starts at the first occurrence of ``str1`` (inclusive) and
    ends just before the first occurrence of ``str2`` found at or after that
    position.  Searching for ``str2`` from the start position, instead of
    from the beginning of the string, keeps an earlier stray end marker from
    producing an empty result.

    Args:
        strs: the text to search.
        str1: the start marker; it is included in the result.
        str2: the end marker; it is excluded from the result.

    Returns:
        The selected slice of ``strs``.

    Raises:
        ValueError: if ``str1`` is absent, or ``str2`` does not occur at or
            after the start of ``str1``.
    """
    start = strs.index(str1)
    stop = strs.index(str2, start)
    return strs[start:stop]
# For every entry of data_arr: download url+entry+'.htm', cut a section out
# of the page with substrSN, split it into pieces, join pieces with commas,
# clean the result with readMsg and append it to ./data.txt.
# NOTE(review): this listing has lost its indentation, and several string
# literals below contain only a line break or nothing at all -- presumably
# HTML markers that were stripped when the code was published. It does not
# parse as written; recover the original markers before running it.
for msg in data_arr:
filename = './data.txt'
bodyData =urllib2.urlopen(url+msg+'.htm')
bodyStr =bodyData.read()
# NOTE(review): both marker arguments are missing here (only a line break
# remains between the quotes) -- TODO confirm the intended start/end markers.
dataStr=substrSN(bodyStr,'
','
')
# NOTE(review): with two empty markers this yields ''; title is only used by
# the commented-out write near the end.
title=substrSN(dataStr,'','')
# NOTE(review): the separator also looks stripped -- TODO confirm.
contents=dataStr.split('
');
val=''
i=0
lens=len(contents)
for msgStr in contents:
# NOTE(review): the condition is cut off after "i"; presumably an upper
# bound on i (lens is computed above but otherwise unused) and the closing
# ':' followed -- TODO confirm.
if i>0 and i
val=val+msgStr+','
i=i+1
# Append the cleaned text; the explicit f.close() is redundant inside the
# with block, which already closes the file on exit.
with open(filename,'a+') as f:
val=readMsg(val)
f.write(val)
f.close()
#f.write(title+"\n"+val)
# NOTE(review): presumably printed once after the loop -- the original
# indentation is not recoverable from this listing.
print '数据获取完成'
第二步分词并统计:
import jieba #分词库
from wordcloud import WordCloud #词云库
import jieba.posseg as pseg
import jieba.analyse
import matplotlib.pyplot as plt #数学绘图库
import re
import os
# Truncate data.txt (creating it if it does not exist) before anything is
# appended to it; the context manager closes the file even if the write fails.
with open('data.txt', 'w', encoding='utf8') as fp:
    fp.write('')
def readMsg(filename):
    """Read a UTF-8 text file and return its cleaned content.

    HTML tags are removed first, then every character listed in the
    character class below (ASCII letters, digits, CR/LF, the ideographic
    space and assorted punctuation), then spaces and colons.

    NOTE(review): this file defines readMsg twice with the same body; the
    later definition is the one in effect, so one of them can be dropped.

    Args:
        filename: path of the file to read.

    Returns:
        The cleaned text.

    Raises:
        OSError: if the file cannot be opened or read.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Context manager so the file is closed even when read/decode fails.
    with open(filename, 'rb') as f:
        f_read = f.read().decode('utf-8')
    # The published listing showed this pattern as r']+>', which is what is
    # left of the usual tag-stripping regex after "<[^>" was itself removed
    # as if it were markup; restored here.
    dr = re.compile(r'<[^>]+>', re.S)
    clsStr = dr.sub('', f_read)
    clsStr = re.sub("[A-Za-z0-9\[\`\u3000\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\r\n\:\;\'\,\[\]\.\\/\?\~\!\@\#\\\&\*\%]", '', clsStr)
    clsStr = clsStr.replace(' ', '')
    clsStr = clsStr.replace(':', '')  # more words to filter out can be added here
    return clsStr
def writeData(strs):
    """Append ``strs`` to data.txt in the current directory (UTF-8).

    Args:
        strs: the text to append.

    Raises:
        OSError: if data.txt cannot be opened or written.
    """
    # Context manager so the file is closed even when the write fails.
    with open('data.txt', 'a+', encoding='utf8') as fp:
        fp.write(strs)
def readMsg(filename):
    """Read a UTF-8 text file and return its cleaned content.

    HTML tags are removed first, then every character listed in the
    character class below (ASCII letters, digits, CR/LF, the ideographic
    space and assorted punctuation), then spaces and colons.

    NOTE(review): this repeats an earlier definition of readMsg in this file
    and replaces it when the script runs; one of the two can be dropped.

    Args:
        filename: path of the file to read.

    Returns:
        The cleaned text.

    Raises:
        OSError: if the file cannot be opened or read.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Context manager so the file is closed even when read/decode fails.
    with open(filename, 'rb') as f:
        f_read = f.read().decode('utf-8')
    # The published listing showed this pattern as r']+>', which is what is
    # left of the usual tag-stripping regex after "<[^>" was itself removed
    # as if it were markup; restored here.
    dr = re.compile(r'<[^>]+>', re.S)
    clsStr = dr.sub('', f_read)
    clsStr = re.sub("[A-Za-z0-9\[\`\u3000\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\r\n\:\;\'\,\[\]\.\\/\?\~\!\@\#\\\&\*\%]", '', clsStr)
    clsStr = clsStr.replace(' ', '')
    clsStr = clsStr.replace(':', '')  # more words to filter out can be added here
    return clsStr
# Clean every file found directly under ./txt and append it to data.txt.
rootdir = './txt'
# os.listdir returns both files and sub-directories of rootdir.
for name in os.listdir(rootdir):
    # `path` was never assigned in the published listing, so the loop raised
    # NameError; build it from the directory and the entry name.
    path = os.path.join(rootdir, name)
    if not os.path.isfile(path):
        continue  # readMsg can only open regular files
    content = readMsg(path)
    writeData(content)
# Segment the merged text with jieba and render the words as a word cloud.
text = readMsg(r'data.txt')
cut_text = jieba.cut(text)
# WordCloud.generate receives the segments joined with "/" as separators.
result = "/".join(cut_text)

wc = WordCloud(
    font_path=r"simhei.ttf",
    background_color='white',
    width=800,
    height=600,
    max_font_size=50,
    max_words=10000,
    min_font_size=10,
    mode='RGBA',
    colormap='pink',
)
wc.generate(result)
wc.to_file(r"bg.png")

plt.figure("词云图")  # name of the figure being drawn
plt.imshow(wc)        # show the word cloud as an image
plt.axis("off")       # hide the image axes
plt.show()
领取专属 10元无门槛券
私享最新 技术干货