threading: python3 版本之后的新线程函数
requests: 第三方库(并非自带,需要手动下载 pip install requests),用于请求网络地址
os: 自带函数,用于操作文件相关
openpyxl: 开源第三方的excel导出的库,需要手动下载 pip install openpyxl
BeautifulSoup: html 代码解析(美化)工具
Element 部分
# -*- coding:UTF-8 -*-
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class downloader(object):
    """Collect article titles and links from the blog index page.

    After ``getUrls()`` has run, ``names`` and ``urls`` are parallel lists
    (same length, same order) and ``nums`` is that common length.
    """

    # Seconds to wait for the index page; without a timeout a stalled
    # connection would block the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Site root that article hrefs are resolved against.
        self.server = 'http://bodboy.gitee.io/'
        # Index page that lists the articles.
        self.target = 'http://bodboy.gitee.io/blog/'
        self.names = []
        self.urls = []
        self.nums = 0

    def getUrls(self):
        """Fetch the index page and append every article title/link found.

        Headers without an ``a.article-title`` anchor, and anchors without
        an ``href``, are skipped instead of raising.
        """
        req = requests.get(url=self.target, timeout=self.REQUEST_TIMEOUT)
        req.encoding = 'utf-8'
        # Name the parser explicitly so the result does not depend on which
        # optional parser packages happen to be installed.
        soup = BeautifulSoup(req.text, 'html.parser')
        for header in soup.find_all('header', class_='article-header'):
            anchors = header.find_all('a', class_='article-title')
            print(anchors)
            if not anchors:
                continue
            anchor = anchors[0]
            href = anchor.get('href')
            if not href:
                continue
            # urljoin copes with root-relative ("/blog/x"), relative and
            # absolute hrefs; plain concatenation produced "//" or worse.
            self.urls.append(urljoin(self.server, href))
            # get_text() always yields a str, whereas .string is None when
            # the anchor wraps nested tags.
            self.names.append(anchor.get_text())
        # Keep the counter in step with the lists callers index into.
        self.nums = len(self.urls)

    def write(self, name, path, text):
        """Append ``text`` followed by a blank line to the file at ``path``.

        ``name`` is accepted for interface compatibility and is not used.
        ``text`` may be a string or an iterable of strings.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(text)
            f.write('\n\n')
if __name__ == "__main__":
    # Script entry point: scrape the article index, then append each
    # title/link pair to the output text file while reporting progress.
    dl = downloader()
    dl.getUrls()
    print("开始下载文件....")
    for i in range(dl.nums):
        dl.write(dl.names[i], '博客文件.txt', dl.names[i] + '\r' + dl.urls[i])
        # Show completed items as a real percentage: count the item just
        # written (i + 1) and scale to 0-100 so the last one reads 100%.
        sys.stdout.write('已下载:%.3f%%' % (100.0 * (i + 1) / dl.nums) + '\r')
        sys.stdout.flush()
    print('文件下载完成')
import threading
import requests
import os
from openpyxl import Workbook
class capturePc():
    """Fetch the public essay list from the API and export it.

    The records can be turned into a flat list of image URLs, downloaded
    as image files, or written to an Excel workbook.
    """

    # Seconds to wait on any HTTP request; without a timeout a stalled
    # connection would block forever.
    REQUEST_TIMEOUT = 10

    # Record keys exported to Excel, in column order.
    EXCEL_FIELDS = ('addTime', 'brief', 'columnName', 'details', 'name', 'updateTime')

    def __init__(self):
        # API endpoint
        self.base_url = 'http://api.newibao.com/web/essay/publicEssayList'
        # Query parameters
        self.params = {
            'page': 1,
            'size': 64,
        }
        # Browser-like headers; if the endpoint required a login, a session
        # token would have to be added here.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate',
        }

    def get_list(self):
        """Return the records under ``data.list`` of the API response.

        Returns ``None`` when the request fails, the body is not JSON, the
        expected keys are absent, or the list is empty.
        """
        try:
            req = requests.get(
                url=self.base_url,
                headers=self.headers,
                params=self.params,
                timeout=self.REQUEST_TIMEOUT,
            )
            payload = req.json()
        except (requests.RequestException, ValueError) as exc:
            print("no data find", exc)
            return None
        try:
            records = payload['data']['list']
        except (KeyError, TypeError):
            # Only a missing or mis-shaped payload is expected here; other
            # errors should surface instead of being swallowed.
            print("no data find")
            return None
        return records or None

    def get_img_list(self):
        """Return a flat list of every image URL found in the records.

        A record's ``picUrl`` may be a single string or an iterable of
        strings; records without one are skipped.
        """
        imgUrls = []
        for entry in self.get_list() or []:
            pics = entry.get('picUrl')
            if not pics:
                continue
            if isinstance(pics, str):
                # Extending a list with a str would add it char by char.
                imgUrls.append(pics)
            else:
                imgUrls.extend(pics)
        print("图片列表为:", imgUrls)
        return imgUrls

    def get_excel_data(self):
        """Return one row per record, with columns ordered as EXCEL_FIELDS.

        Fields missing from a record become ``None`` cells.
        """
        excel_list = [
            [entry.get(field) for field in self.EXCEL_FIELDS]
            for entry in self.get_list() or []
        ]
        print(excel_list)
        return excel_list

    def download_img(self, name):
        """Download every listed image into directory ``name``.

        The directory is created if needed. Each image is fetched on its
        own thread and saved as ``<n>.jpg`` where ``n`` counts from 1. The
        threads are started but not joined.
        """
        # Create the target folder if it does not exist yet.
        if not os.path.exists(name):
            os.makedirs(name, exist_ok=True)
            print("文件夹{}创建成功".format(name))
        for index, image_url in enumerate(self.get_img_list(), start=1):
            threading.Thread(target=self.download, args=(index, image_url, name)).start()

    def download(self, name, image_url, path):
        """Fetch ``image_url`` and save it as ``<path>/<name>.jpg``.

        A failed request is reported and skipped, so no file is written
        for an error response.
        """
        print('开始下载:', name)
        try:
            resp = requests.get(image_url, timeout=self.REQUEST_TIMEOUT)
            resp.raise_for_status()
        except requests.RequestException as exc:
            print('下载失败', name, exc)
            return
        target = os.path.join(path, '%s.jpg' % name)
        with open(target, 'wb') as f:
            f.write(resp.content)
        print('下载完成', name)

    def export_excel(self, filename='蓝海图文数据.xlsx'):
        """Write the essay records to an Excel workbook.

        Args:
            filename: Path of the workbook to create or overwrite.
        """
        # Create the workbook and use its default sheet.
        wb = Workbook()
        ws = wb.active
        # Header row.
        ws.append(['创建时间', '标题', '分类', '详情', '名称', '更新时间'])
        # One row per record.
        for row in self.get_excel_data():
            ws.append(row)
        wb.save(filename=filename)
        print("写入成功")
if __name__ == "__main__":
    # Script entry point: fetch the essay list and export it to the
    # default Excel workbook.
    capturePc().export_excel()