When Python comes up, what do you think of? Big data? Django? Mini programs? Artificial intelligence? Web crawlers? As a beginner-friendly language for a broad audience, Python is studied by everyone from primary school students to university students. Today I'd like to share a fun Python example: using a web crawler to download novels.

A crawler simply simulates HTTP requests and then parses the returned page data to extract the content we want. Today we'll crawl a fairly crawler-friendly novel site, https://www.dingdiann.com/. In my own testing it doesn't ban IPs, doesn't rate-limit access, and never shows a CAPTCHA, which makes it a comfortable site to harvest from. Without further ado, here's the source code.
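Before reading the full script, here is a minimal sketch of that request-then-parse pattern; the URL is a placeholder, not the target site:

import requests
from bs4 import BeautifulSoup

# Fetch the page as a browser would (placeholder URL).
html = requests.get('https://www.example.com/').text

# Parse the HTML and extract the piece we care about.
soup = BeautifulSoup(html, 'html.parser')
print(soup.title.string)

Everything the full script does below is a more elaborate version of these two steps.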
Before running the script, install the third-party dependencies. Note that the script parses HTML through BeautifulSoup's lxml backend, so lxml is needed as well:

BeautifulSoup4
pip3 install beautifulsoup4

requests
pip3 install requests

lxml
pip3 install lxml
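A quick sanity check that all three packages are importable:

python3 -c "import requests, bs4, lxml; print(requests.__version__, bs4.__version__)"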
book.py
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# The requests below use verify=False, so silence the TLS warning noise.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


class BookSpider():
    '''Scrape novels from the Dingdian novel site.'''

    def __init__(self):
        self.headers = {
            'Host': 'www.dingdiann.com',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        }
        self.chapter_url_list = list()
        self.chapter_name_list = list()
        self.book_info = dict()

    def get_book_url(self, book_name, author_name):
        '''Search for the book and return its detail-page URL.'''
        url = 'https://www.dingdiann.com/searchbook.php'
        data = {
            'keyword': book_name
        }
        result = requests.get(url, headers=self.headers, params=data, verify=False).text
        soup = BeautifulSoup(result, 'lxml')
        book_name_list = soup.find_all(name='span', attrs={'class': 's2'})
        book_author_list = soup.find_all(name='span', attrs={'class': 's4'})
        # The first s2/s4 spans belong to the table header, not a result row.
        book_name_list.pop(0)
        book_author_list.pop(0)
        # Group the results by author: {author: [(book_name, book_url), ...]}
        for candidate_name in book_name_list:
            name = str(candidate_name.a.string)
            book_url = str(candidate_name.a.get('href'))
            book_info_tuple = (name, book_url)
            author = str(book_author_list[0].string)
            if author in self.book_info.keys():
                self.book_info[author].append(book_info_tuple)
            else:
                self.book_info[author] = [book_info_tuple]
            book_author_list.pop(0)
        if author_name in self.book_info:
            for info in self.book_info[author_name]:
                if info[0] == book_name:
                    url = info[1]
                    print('Book found: ' + book_name + ' by ' + author_name)
                    print('Download will start in 3 seconds...')
                    time.sleep(3)
                    return url
        print('Sorry, the book was not found. Please check the title and author name.')

    def get_book_info(self, url):
        '''Collect chapter names and URLs from the book's index page.'''
        all_url = 'https://www.dingdiann.com' + url
        result = requests.get(all_url, headers=self.headers, verify=False).text
        soup = BeautifulSoup(result, 'lxml')
        div = soup.find_all(id='list')[0]
        chapter_list = div.dl.contents
        for text in chapter_list:
            text = str(text)
            content = re.findall('<a href="' + url + '(.*?)" style="">(.*?)</a>.*?', text)
            if content:
                chapter_url = all_url + content[0][0]
                chapter_name = content[0][1]
                self.chapter_url_list.append(chapter_url)
                self.chapter_name_list.append(chapter_name)
        # The index page lists the 12 newest chapters first; drop them so the
        # download starts from chapter one.
        for i in range(12):
            self.chapter_url_list.pop(0)
            self.chapter_name_list.pop(0)

    def get_chapter_content(self, name, url):
        '''Fetch the text of one chapter, or False if the request fails.'''
        try:
            result = requests.get(url, headers=self.headers, verify=False).text
        except:
            print(name + ' failed to download.')
            return False
        else:
            soup = BeautifulSoup(result, 'lxml')
            div = soup.find_all(id='content')[0]
            div = str(div)
            # Keep only the text between the content div and the trailing script tag.
            result = re.findall('<div id="content">(.*?)<script>', div, re.S)[0].strip()
            result = re.sub('<br/>', '\n', result)
            return result

    def save_book(self, book_name):
        '''Append every chapter to <book_name>.txt, retrying failed downloads.'''
        for chapter_name in self.chapter_name_list:
            while True:
                chapter_content = self.get_chapter_content(chapter_name, self.chapter_url_list[0])
                if chapter_content:
                    with open(book_name + '.txt', 'a', encoding='utf-8') as f:
                        f.write(chapter_name)
                        f.write('\n')
                        f.write(chapter_content)
                        f.write('\n')
                    self.chapter_url_list.pop(0)
                    print(chapter_name + ' downloaded.')
                    break

    def run(self, book_name, url):
        self.get_book_info(url)
        self.save_book(book_name)


def main(book_name, author_name):
    book = BookSpider()
    url = book.get_book_url(book_name, author_name)
    book.run(book_name, url)


if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2])
Usage: the script takes two arguments, the novel title (argument 1) and the author name (argument 2), and saves everything it collects to a local text file. For example:

python3 book.py 天珠变 唐家三少
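If you prefer to drive the spider from another script, the same flow that main() runs can be used directly (assuming book.py is on the import path):

from book import BookSpider

spider = BookSpider()
# get_book_url returns the detail-page URL, or None if nothing matched.
url = spider.get_book_url('天珠变', '唐家三少')
if url:
    spider.run('天珠变', url)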
1. The novel data collected by this script comes from the Dingdian novel site (顶点小说网); the script only provides data collection and is not used for selling anything.
2. The data is collected from https://www.dingdiann.com/. Thanks to the site administrators for their generosity, and please support the official releases.