本文以'allitebooks'网站为对象,实现电子书标题、作者、简介批量获取,并以json和csv文件形式存入本地。
代码使用python的requests模块和xpath、bs4两种方式,并以json和csv格式转存本地。
分成4步:1,发请求;2,解析数据;3,保存数据;4,json转换成csv。
Code:1,使用xpath;2,使用bs4。
# 1,使用xpath:
import requests
from lxml import etree
import json
import csv
import time
class BookSpider(object):
    """Scrape e-book title, cover URL, author and summary from allitebooks.com.

    Parsed records accumulate in ``self.data_of_book_dict`` (one dict per
    book) and are written to ``ebook_xpath.json`` by :meth:`save_data`.
    """

    def __init__(self):
        # The {} placeholder is filled with a page number in get_url_list().
        self.base_url = 'http://www.allitebooks.com/page/{}'
        # Browser-like UA so the site does not reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.86 Chrome/73.0.3683.86 Safari/537.36'}
        # One dict per book, appended by parse_xpath_data().
        self.data_of_book_dict = []

    # 1. Build the list of page URLs to crawl (pages 1..9, upper bound exclusive).
    def get_url_list(self):
        return [self.base_url.format(i) for i in range(1, 10)]

    # 2. Fetch one page and return its decoded HTML text.
    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    # 3. Parse one page's HTML and append one dict per book.
    def parse_xpath_data(self, data):
        parse_data = etree.HTML(data)
        # Each book is an <article> inside the main content container.
        book_list = parse_data.xpath(
            '//div[@class="main-content-inner clearfix"]/article')
        for book in book_list:
            book_dict = {}
            # Title: join the text nodes into one string.  The original code
            # stored the raw list returned by xpath(), which was inconsistent
            # with the other (string) fields and printed as "['...']" in CSV.
            book_dict['book_name'] = ''.join(
                book.xpath('.//h2[@class="entry-title"]//text()')).strip()
            # Cover image URL: @src attribute of the thumbnail <img>.
            book_dict['book_img_url'] = book.xpath(
                './div[@class="entry-thumbnail hover-thumb"]/a/img/@src')[0]
            # Author name.
            book_dict['book_author'] = book.xpath(
                './/h5[@class="entry-author"]/a/text()')[0]
            # Summary: first paragraph of the entry summary.
            book_dict['book_info'] = book.xpath(
                './/div[@class="entry-summary"]/p/text()')[0]
            self.data_of_book_dict.append(book_dict)

    # 4. Dump all collected books to a JSON file.
    def save_data(self):
        # Context manager closes the file (the original leaked the handle);
        # ensure_ascii=False keeps any non-ASCII text readable in the output.
        with open('ebook_xpath.json', 'w') as fp:
            json.dump(self.data_of_book_dict, fp, ensure_ascii=False)

    # 5. Entry point: crawl every page, then persist the results.
    def start(self):
        for url in self.get_url_list():
            print(url)
            data = self.send_request(url)
            self.parse_xpath_data(data)
        self.save_data()
# Run the spider and time the whole crawl + JSON-to-CSV conversion.
start = time.time()
BookSpider().start()

# Convert the JSON list of dicts into a CSV file (one row per book).
# Context managers close both files even on error (original leaked handles);
# newline='' is required by the csv module to avoid blank rows on Windows.
with open('ebook_xpath.json', 'r') as json_fp, \
        open('ebook_xpath.csv', 'w', newline='') as csv_fp:
    data_list = json.load(json_fp)
    # Header row: keys of the first record (all records share the same keys).
    sheet_title = data_list[0].keys()
    # Body rows: the values of each record, in the same key order.
    sheet_data = [data.values() for data in data_list]
    writer = csv.writer(csv_fp)
    writer.writerow(sheet_title)
    writer.writerows(sheet_data)

end = time.time()
print('cost time: ', end - start, 's')
# 2,使用bs4:
import requests
from bs4 import BeautifulSoup
import json
import csv
import time
class BookSpider(object):
    """Scrape e-book title, cover URL, author and summary from allitebooks.com
    using BeautifulSoup.

    Parsed records accumulate in ``self.data_of_book_dict`` (one dict per
    book) and are written to ``ebook_bs4.json`` by :meth:`save_data`.
    """

    def __init__(self):
        # The {} placeholder is filled with a page number in get_url_list().
        self.base_url = 'http://www.allitebooks.com/page/{}'
        # Browser-like UA so the site does not reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.86 Chrome/73.0.3683.86 Safari/537.36'}
        # One dict per book, appended by parse_bs4_data().
        self.data_of_book_dict = []

    # 1. Build the list of page URLs to crawl (pages 1..9, upper bound exclusive).
    def get_url_list(self):
        return [self.base_url.format(i) for i in range(1, 10)]

    # 2. Fetch one page and return its decoded HTML text.
    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    # 3. Parse one page's HTML with BeautifulSoup; append one dict per book.
    def parse_bs4_data(self, data):
        bs4_data = BeautifulSoup(data, 'lxml')
        # Each book on the listing page is an <article> element.
        book_list = bs4_data.select('article')
        for book in book_list:
            book_dict = {}
            # Title text of the entry.
            book_dict['book_name'] = book.select_one('.entry-title').get_text()
            # Cover image URL from the thumbnail's src attribute.
            book_dict['book_img_url'] = book.select_one(
                '.attachment-post-thumbnail').get('src')
            # Author: slice off the leading "By " prefix (3 characters).
            book_dict['book_author'] = book.select_one(
                '.entry-author').get_text()[3:]
            # Summary paragraph.
            book_dict['book_info'] = book.select_one(
                '.entry-summary').get_text()
            self.data_of_book_dict.append(book_dict)

    # 4. Dump all collected books to a JSON file.
    def save_data(self):
        # Context manager closes the file (the original leaked the handle);
        # ensure_ascii=False keeps any non-ASCII text readable in the output.
        with open('ebook_bs4.json', 'w') as fp:
            json.dump(self.data_of_book_dict, fp, ensure_ascii=False)

    # 5. Entry point: crawl every page, then persist the results.
    def start(self):
        for url in self.get_url_list():
            print(url)
            data = self.send_request(url)
            self.parse_bs4_data(data)
        self.save_data()
# Run the spider and time the whole crawl + JSON-to-CSV conversion.
start = time.time()
BookSpider().start()

# Convert the JSON list of dicts into a CSV file (one row per book).
# Context managers close both files even on error (original leaked handles);
# newline='' is required by the csv module to avoid blank rows on Windows.
with open('ebook_bs4.json', 'r') as json_fp, \
        open('ebook_bs4.csv', 'w', newline='') as csv_fp:
    data_list = json.load(json_fp)
    # Header row: keys of the first record (all records share the same keys).
    sheet_title = data_list[0].keys()
    # Body rows: the values of each record, in the same key order.
    sheet_data = [data.values() for data in data_list]
    writer = csv.writer(csv_fp)
    writer.writerow(sheet_title)
    writer.writerows(sheet_data)

# BUG FIX: the original wrote `end = time.time` (no call), so `end - start`
# raised TypeError (unsupported operand types) and the timing print crashed.
end = time.time()
print('cost time: ', end - start, 's')
本文分享自 MiningAlgorithms 微信公众号,前往查看
如有侵权,请联系 cloudcommunity@tencent.com 删除。
本文参与 腾讯云自媒体同步曝光计划 ,欢迎热爱写作的你一起参与!