终于考完试了，可以做些自己的东西了。小说爬取代码如下，这次用到了 XPath。
import requests
from lxml import html
import re
import time
def get_url():
    """Fetch the novel's index page and collect chapter links and titles.

    Returns:
        A ``(urls, mulu_list)`` tuple. ``urls`` holds the absolute URLs of
        the ``div/div/ul/li/a`` links on the index page, minus the first
        five. ``mulu_list`` holds the link texts found inside the
        ``ul`` element with class ``mulu_list``.

    Raises:
        requests.RequestException: if the index page cannot be fetched,
            including when the request times out.
    """
    # Imported locally so this block needs no new file-level import.
    from urllib.parse import urljoin

    url = "https://www.ybdu.com/xiaoshuo/0/910/"
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    res = requests.get(url, timeout=30)
    seletor = html.fromstring(res.content)

    hrefs = seletor.xpath('//div/div/ul/li/a/@href')
    mu_lu = seletor.xpath('//div/div/div/ul[@class="mulu_list"]/li/a/text()')

    # urljoin gives the same result as plain concatenation for bare relative
    # hrefs, and also handles root-relative or absolute hrefs correctly.
    urls = [urljoin(url, str(href)) for href in hrefs]
    mulu_list = list(mu_lu)

    # NOTE(review): the first five links are skipped; presumably they are
    # non-chapter links matched by the broad XPath above. Confirm against
    # the live page.
    return urls[5:], mulu_list
#print(get_url())
def dowmload():
    """Download every chapter and append its text to ``<title>.txt``.

    The chapter list is obtained from ``get_url()``. For each chapter the
    text nodes under the element with id ``htmlContent`` are read, all
    whitespace is removed from each node, and the pieces are concatenated
    and appended to a file named after the chapter title in the current
    working directory.

    Links and titles are paired by position. If the two lists differ in
    length, the surplus entries of the longer list are ignored.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when a request times out.
        OSError: if a chapter file cannot be opened or written.
    """
    # Fetch the index once; calling get_url() inside the loop would re-download
    # the index page several times per chapter.
    urls, titles = get_url()

    for count, (link, title) in enumerate(zip(urls, titles)):
        res = requests.get(link, timeout=30)
        seletor = html.fromstring(res.content)
        nei_rong = seletor.xpath('//*[@id="htmlContent"]/text()')

        # Strip all whitespace inside each text fragment, then glue the
        # fragments together into one string.
        out = "".join("".join(part.split()) for part in nei_rong)

        print(title)
        print("正在载入第" + str(count))

        # Name the file after the same title that was just printed. Using
        # index count-1 would put the first chapter under the last title.
        # An explicit encoding avoids failures on platforms whose default
        # codec cannot represent the chapter text.
        with open(title + ".txt", 'a', encoding="utf-8") as f:
            f.write(out)
# Script entry: start the download as soon as this file is executed.
dowmload()
#print (get_url()[0])
加关注
公众号:锦河工作室
领取专属 10元无门槛券
私享最新 技术干货