# -*- coding: utf-8 -*-
import json
import os
import re
import urllib.request
from urllib.parse import urlencode

import scrapy
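
# Spider for Sogou image search (pic.sogou.com): it pages through the
# AJAX search endpoint for a fixed keyword and downloads each result image.
# Run from inside a Scrapy project with:  scrapy crawl sougouimg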


class SougouimgSpider(scrapy.Spider):
    name = 'sougouimg'
    allowed_domains = ['pic.sogou.com']
    start_urls = ['https://pic.sogou.com/']

    def parse(self, response):
        endpage = 5  # last page to request (range() stops before this value)
        keywords = '哆啦A梦'  # search keyword ("Doraemon")
        for page in range(1, endpage):
            yield scrapy.Request(self.geturl(keywords, page), callback=self.sougou)

    def sougou(self, response):
        # The endpoint answers with JSON; each item holds one image URL.
        js = json.loads(response.text)
        for item in js['items']:
            img_url = item['pic_url']
            self.save(img_url)

    def geturl(self, keywords, page):
        # Build the AJAX search URL from the keyword and the page number.
        param = {
            'query': keywords,
            'mode': '1',
            'start': page * 48,  # 48 results per page
            'reqType': 'ajax',
            'reqFrom': 'result',
            'tn': '0'
        }
        return 'https://pic.sogou.com/pics?' + urlencode(param)
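
    # Illustrative example of the generated URL; the percent-encoding of the
    # keyword is shown as an assumption, derived from its UTF-8 bytes:
    #   geturl('哆啦A梦', 1) ->
    #   'https://pic.sogou.com/pics?query=%E5%93%86%E5%95%A6A%E6%A2%A6&mode=1&start=48&reqType=ajax&reqFrom=result&tn=0'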

    def save(self, img_url):
        # Save images into a "搜狗图片" ("Sogou images") folder next to this file.
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '搜狗图片')
        if not os.path.exists(path):
            os.makedirs(path)
        # Use the last path segment of the URL as the file name.
        title = re.findall(r'[^/]+$', img_url)[0]
        target = os.path.join(path, title)
        try:
            urllib.request.urlretrieve(img_url, target)
        except Exception:
            print(title + ' download failed')
        else:
            print(title + ' downloaded')
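
    # Design note: urllib.request.urlretrieve blocks while it downloads;
    # for larger crawls, Scrapy's ImagesPipeline would route the image
    # requests through Scrapy's own asynchronous downloader instead.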

# by 浅枫沐雪