GitHub source code: https://github.com/ronghuaxu/zhanlang2_wordcloud
Below I go through the code and describe the problems I ran into while writing the crawler:
Core Python crawler code for the short reviews
1. Main function for the paginated crawl
# -*- coding:utf-8 -*-
import random
import time
import codecs

from downloader import download as dd
from parser import movieparser as ps

# Paginated crawling task
if __name__ == '__main__':
    templateurl = 'https://movie.douban.com/subject/26363254/comments?start={}&limit=20&sort=new_score&status=P'
    with codecs.open('pjl_comment.txt', 'a', encoding='utf-8') as f:
        # about 4249 pages at 20 comments per page
        for i in range(4249):
            print('Crawling comment page {}...'.format(i))
            targeturl = templateurl.format(i * 20)
            res = dd.download_page(targeturl)
            f.writelines(ps.get_douban_comments(res))
            # pause 1-2 seconds between requests to stay polite
            time.sleep(1 + float(random.randint(1, 20)) / 20)
2. Function that parses the comment text out of the downloaded pages
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup

def get_douban_comments(res):
    comments_list = []  # list of comment strings
    soup = BeautifulSoup(res, 'html.parser')
    # each short review sits in a <p> directly under a .comment node
    comment_nodes = soup.select('.comment > p')
    for node in comment_nodes:
        comments_list.append(node.get_text().strip().replace(" ", "") + u'\n')
    return comments_list
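For a quick local sanity check of the '.comment > p' selector, here is an illustrative snippet; the HTML below is a made-up stand-in for Douban's markup, not copied from the site.

# -*- coding:utf-8 -*-
# Illustrative only: fake markup shaped like Douban's comment list,
# used to exercise get_douban_comments without hitting the site.
from parser import movieparser as ps

sample_html = u'''
<div class="comment"><p> 很 燃 的 一 部 电 影 </p></div>
<div class="comment"><p> 节奏紧凑，值得一看 </p></div>
'''

for comment in ps.get_douban_comments(sample_html):
    print(comment)
# expected: the two comment texts with spaces stripped, one per line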
3. Function that downloads the page source
# -*- coding:utf-8 -*-
import requests

# Download the page source
def download_page(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Cookie': 'input your login cookie'
    }
    html = requests.get(url, headers=header).content
    return html
Note: crawling the short reviews requires being logged in, so you can register a throwaway account for the crawler to use.
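Since the later comment pages are only served to logged-in users, the Cookie header in download_page has to carry a real Douban session. Below is a sketch of one way to wire that up with a requests.Session; it is a variant of the function above, not the repo's code, and the cookie value is a placeholder you would copy out of a logged-in browser.

# Sketch: reuse one Session so the login cookie and User-Agent ride along on every request.
import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0',
    # paste the Cookie header from a logged-in browser session here
    'Cookie': 'input your login cookie',
})

def download_page(url):
    resp = session.get(url)
    resp.raise_for_status()  # fail fast if Douban starts returning errors
    return resp.content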
After the crawl finishes you end up with a roughly 10 MB file:
[Screenshot: the resulting comment file]
Tokenizing the movie comments and drawing the word cloud
# -*- coding:utf-8 -*-
import codecs
from os import path

import jieba
from scipy.misc import imread
from wordcloud import WordCloud

def get_all_keywords(file_name):
    word_lists = []  # list of all tokens
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()  # one comment per line
        for List in Lists:
            cut_list = list(jieba.cut(List))
            for word in cut_list:
                word_lists.append(word)
    word_lists_set = set(word_lists)  # deduplicate the tokens
    sort_count = []
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print u"There are %d keywords in total" % length
    k = 1
    for w in word_lists_set:
        sort_count.append(w + u':' + unicode(word_lists.count(w)) + u" times\n")
        print u"%d---" % k + w + u":" + unicode(word_lists.count(w)) + u" times"
        k += 1
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)
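As written, get_all_keywords calls word_lists.count(w) once per distinct word, which rescans the whole token list each time and gets slow on a corpus this size. Below is a sketch of an equivalent single-pass tally with collections.Counter; count_keywords is a hypothetical helper, not part of the repo.

# Sketch: the same word tally in one pass using collections.Counter.
import codecs
from collections import Counter

import jieba

def count_keywords(file_name, out_name='count_word.txt'):
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        counts = Counter(w for line in f for w in jieba.cut(line))
    with codecs.open(out_name, 'w', encoding='utf-8') as f:
        for word, n in counts.most_common():
            f.write(u'%s:%d\n' % (word, n))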
Drawing the word cloud
def save_jieba_result():
    # enable parallel tokenization to speed up jieba
    jieba.enable_parallel(4)
    dirs = path.join(path.dirname(__file__), '../pjl_comment.txt')
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    # join the jieba tokens into one space-separated string
    cut_text = " ".join(jieba.cut(comment_text))
    with codecs.open('pjl_jieba.txt', 'a', encoding='utf-8') as f:
        f.write(cut_text)

def draw_wordcloud2():
    dirs = path.join(path.dirname(__file__), 'pjl_jieba.txt')
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    color_mask = imread("/Users/huazi/Desktop/music.jpg")  # load the mask image
    stopwords = [u'就是', u'电影', u'你们', u'这么', u'不过', u'但是', u'什么', u'没有',
                 u'这个', u'那个', u'大家', u'比较', u'看到', u'真是', u'除了', u'时候',
                 u'已经', u'可以']
    cloud = WordCloud(font_path="/Users/huazi/Desktop/simsunttc/simsun.ttc",
                      background_color='white',
                      max_words=2000,
                      max_font_size=200,
                      min_font_size=4,
                      mask=color_mask,
                      stopwords=stopwords)
    word_cloud = cloud.generate(comment_text)  # build the word cloud
    word_cloud.to_file("pjl_cloud.jpg")

save_jieba_result()
draw_wordcloud2()
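One portability note: scipy.misc.imread was deprecated and later removed from SciPy, so on a newer environment the mask image has to be loaded some other way. A minimal sketch using Pillow and NumPy instead:

# Sketch: load the mask image without scipy.misc.imread (removed in SciPy 1.2+).
import numpy as np
from PIL import Image

color_mask = np.array(Image.open("/Users/huazi/Desktop/music.jpg"))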
Problems I ran into
Because there is a lot of comment data, tokenizing it with jieba takes a long time. The speedup that worked for me was turning on jieba's parallel mode, which splits the work across multiple worker processes: jieba.enable_parallel(4). See the sketch below.
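For reference, a rough way to see the difference on the crawled file; this is a sketch, and note that jieba's parallel mode is built on multiprocessing, so it only works on POSIX systems, not on Windows.

# Sketch: compare single-process and parallel jieba on the crawled comments.
import time
import codecs
import jieba

with codecs.open('pjl_comment.txt', encoding='utf-8') as f:
    text = f.read()

start = time.time()
list(jieba.cut(text))
print('single process: %.1fs' % (time.time() - start))

jieba.enable_parallel(4)   # 4 worker processes; POSIX only
start = time.time()
list(jieba.cut(text))
print('4 processes: %.1fs' % (time.time() - start))
jieba.disable_parallel()   # switch back to single-process mode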
When using WordCloud, passing Chinese words through the stopwords parameter never took effect for me; I eventually got the result I wanted by modifying the relevant function in the library's source code.
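An alternative that avoids patching the library at all (a sketch, not the author's fix) is to drop the stopwords while joining the jieba tokens, so the text handed to generate() never contains them. The output filename here is hypothetical.

# -*- coding: utf-8 -*-
# Sketch: filter Chinese stopwords during tokenization instead of relying on
# WordCloud's stopwords parameter.
import codecs
import jieba
from wordcloud import WordCloud

stopwords = {u'就是', u'电影', u'你们', u'这么', u'不过', u'但是', u'什么', u'没有',
             u'这个', u'那个', u'大家', u'比较', u'看到', u'真是', u'除了', u'时候',
             u'已经', u'可以'}

with codecs.open('pjl_comment.txt', encoding='utf-8') as f:
    comment_text = f.read()

# keep only non-empty tokens that are not stopwords
cut_text = " ".join(w for w in jieba.cut(comment_text)
                    if w.strip() and w not in stopwords)

cloud = WordCloud(font_path="/Users/huazi/Desktop/simsunttc/simsun.ttc",
                  background_color='white', max_words=2000)
cloud.generate(cut_text).to_file("pjl_cloud_filtered.jpg")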