本篇文章会针对用户在猫眼上对于「碟中谍6」的评论进行一个可视化分析,我们总共采集了44872条用户评论,文章内容包括:
我这边是使用DataFrame将数据读入内存,代码如下:
import pandas as pd

# Load the raw comment dump (one record per line, 5 fields separated by '|')
# into a DataFrame. A line containing no '|' is the wrapped continuation of a
# multi-line comment body and only carries comment text.
with open('comment.txt', 'r') as f:
    comment = f.read()
comment_list = comment.split('\n')
# Single-arg print() works identically on Python 2 and 3.
print('>>>累计评论数:%s\n' % len(comment_list))

data = []
temp = ['', '', '', '', '']  # scratch record reused for continuation lines
for comment in comment_list:
    comment = comment.split('|')
    if len(comment) == 1:
        # Continuation line: store its text as the comment-body field.
        temp[4] = comment[0]
        # BUG FIX: append a *copy*. The original appended `temp` itself, so
        # every such row aliased the same list and later assignments to
        # temp[4] silently rewrote rows already stored in `data`.
        data.append(temp[:])
    elif len(comment) != 5:
        # Malformed line (wrong field count): skip it.
        pass
    else:
        data.append(comment)
data = pd.DataFrame(data,
                    columns=['时间', '昵称', '城市', '评分', '内容'])
print(data.head())
# Distribution of ratings: count commenters per score, ignoring blank scores.
score_dist = data[data['评分'] != ''].groupby('评分')['昵称'].count().reset_index()
score_dist.columns = ['评分', '数量']

# Rose-type pie chart: the radius of each sector encodes the count.
pie = pyecharts.Pie('「碟中谍」评分分布', '统计时间:2018-9-6')
pie.add("??", score_dist['评分'], score_dist['数量'],
        radius=[30, 75], rosetype='radius',
        is_legend_show=False, is_label_show=True)
pie
# Derive date and hour-of-day columns from the 'YYYY-MM-DD HH:MM:SS' timestamp.
data['日期'] = data['时间'].str[0:10]
data['小时'] = data['时间'].str[11:13]

# Comment volume per (hour, date) cell over the release week.
temp = data[(data['时间'] >= '2018-08-31 00:00:00') & (data['时间'] <= '2018-09-07 00:00:00')].groupby(['小时', '日期'])['昵称'].count().reset_index()
temp.columns = ['小时', '日期', '数量']

# y-axis labels for the heatmap, in display order.
date = ['2018-08-31',
        '2018-09-01',
        '2018-09-02',
        '2018-09-03',
        '2018-09-04',
        '2018-09-05',
        '2018-09-06']

temp['小时'] = temp['小时'].astype('int')
# Map each date string to its row index on the heatmap's y-axis.
# FIX: built from `date` itself instead of a second hand-written dict
# literal, so the axis labels and the index mapping can never drift apart.
temp['日期'] = temp['日期'].replace({d: i for i, d in enumerate(date)})
temp = temp.values.tolist()

hour = list(range(24))  # x-axis: hours 0-23 (list() keeps this Py3-safe)
HeatMap = pyecharts.HeatMap('评论-时间分布', '统计时间:2018-09-06')
HeatMap.add("评论数量", hour, date, temp, is_visualmap=True, visual_range=[0, 700], is_legend_show=False,
            visual_text_color="#000", visual_orient='vertical', visual_pos="right")
HeatMap
http://api.map.baidu.com/geocoder?address=位置&output=json&key=申请到的key
就能获取到经纬度信息了。接下来就是写个循环将你的地理信息传过去,获取到信息之后保存下来就可以了,比自己一个个手动添加是不是有效率多了~
import requests
from tqdm import tqdm

# Normalize a city name the Baidu geocoder does not recognize.
# BUG FIX: the original used chained indexing (data['城市'][mask] = ...),
# which raises SettingWithCopyWarning and may silently fail to write back;
# .loc assigns on the frame itself.
data.loc[data['城市'] == '伊犁', '城市'] = '伊犁哈萨克自治州'

# Comment count per (non-empty) city.
temp = data[data['城市'] != ''].groupby('城市')['昵称'].count().reset_index()
temp.columns = ['城市', '数量']

headers = {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
key = '申请的KEY '  # Baidu Map API key — fill in your own

# Resolve every distinct city to [lng, lat] via the Baidu geocoding API.
dic = {}
city_list = list(set(temp['城市']))
for city in tqdm(city_list):
    url = 'http://api.map.baidu.com/geocoder/v2/?address=%s&output=json&ak=%s' % (city, key)
    # FIX: actually send the User-Agent defined above — the original
    # built `headers` but never passed it to the request.
    response = requests.get(url, headers=headers)
    position = response.json()
    position_list = []  # renamed from `positin_list` (typo)
    if position['status'] == 0:  # status 0 means success
        position_list.append(position['result']['location']['lng'])
        position_list.append(position['result']['location']['lat'])
        dic[city] = position_list
    else:
        # Unresolvable city: leave it out of the coordinate map.
        pass
# Geographic heatmap of comment counts, using the coordinates fetched above.
geo_chart = pyecharts.Geo("评论城市分布", "来源:Kaggle", title_color="#fff", title_pos="center",
                          width=800, height=600, background_color='#404a59')
geo_chart.add("", temp['城市'], temp['数量'], visual_range=[0, 1000], type='heatmap',
              visual_text_color="#fff", is_visualmap=True, is_legend_show=False,
              geo_cities_coords=dic)
geo_chart
(图:评论城市分布地理热力图)
from jieba import posseg as psg
import collections

# Concatenate every non-empty comment body into one string for segmentation.
# BUG FIX: the original used the `<>` operator — long-deprecated Python 2
# syntax, removed in Python 3; `!=` is the portable spelling.
string = ''.join(data['内容'][data['内容'] != ''])

# Generic connectives etc. to ignore beyond jieba's own POS tagging.
stop_words = ['就是', '这是', '但是', '虽然', '一部', '觉得', '还是', '没有']

# Keep words that are not punctuation/symbols (POS flag 'x'), are longer
# than one character, and are not stop words.
# NOTE(review): the .encode('utf-8') matches the byte-string stop words
# under Python 2 only — under Python 3 this comparison would never match;
# confirm the target interpreter before porting.
word_list = []
for x in psg.cut(string):
    if x.flag == 'x':
        continue
    if len(x.word) == 1:
        continue
    if x.word.encode('utf-8') in stop_words:
        continue
    word_list.append(x.word)

# Top-10 most frequent words, split into parallel label/count lists for the chart.
c = collections.Counter(word_list)
attr = []
value = []
for word, count in c.most_common(10):
    attr.append(word)
    value.append(count)
# Bar chart of the ten most frequent words, with the maximum highlighted.
bar = pyecharts.Bar("评论中出现频率最高的10个词", "统计时间:2018-09-06")
bar.add("出现次数", attr, value, mark_point=['max'], is_legend_show=False)
bar
import imageio
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Word cloud shaped by the Tom Cruise mask image.
back_color = imageio.imread('TomCruise.jpg')

cloud = WordCloud(background_color='white',
                  max_words=5000,
                  mask=back_color,
                  max_font_size=200,
                  font_path="/Users/tangwenpan/Documents/fonts/SimHei.ttf",
                  random_state=None)
cloud.generate(' '.join(word_list))

# Recolor the words with the mask image's own palette, then render and save.
image_colors = ImageColorGenerator(back_color)
plt.figure(figsize=(15, 8))
plt.imshow(cloud.recolor(color_func=image_colors))
plt.axis('off')
plt.show()
cloud.to_file('comment.png')
skrskr~