需求:手机抓包和下载图片(图片重命名)
1.3 手机设置代理服务器
使用命令ipconfig在Windows上查看猎豹免费WiFi产生的IP地址
手机设置代理服务器
2.Letvlive.py
import scrapy
import json
from Letv.items import LetvItem
# Basic spider: the class name is arbitrary, but it must extend scrapy.Spider.
class LetvliveSpider(scrapy.Spider):
    """Crawl the Letv/Huya live-channel JSON API page by page.

    Each API page yields one LetvItem per live channel (nickname +
    screenshot URL); pagination continues while the response header
    reports status "1".
    """

    # Spider name; must be unique within the project (used by `scrapy crawl`).
    name = 'Letvlive'
    # Followed links are restricted to these domains; start_urls are exempt.
    allowed_domains = ['letv.com']
    # Current page index, incremented as pagination proceeds.
    page = 1
    pre = "http://dynamic.live.app.m.letv.com/android/dynamic.php?luamod=main&mod=live&ctl=liveHuya&act=channelList&pcode=010210000&version=7.17&channelId=2168&pages="
    # BUG FIX: the original suffix contained "®ion=CN" — the "&reg" of
    # "&region" had been rendered as the HTML entity "®", corrupting the
    # query string. Restored to the intended "&region=CN" parameter.
    suf = "&country=CN&provinceid=1&districtid=9&citylevel=1&location=%E5%8C%97%E4%BA%AC%E5%B8%82%7C%E6%9C%9D%E9%98%B3%E5%8C%BA&lang=chs&region=CN"
    start_urls = [pre + str(page) + suf]

    def parse(self, response):
        """Parse one JSON API page: yield items, then request the next page."""
        python_dict = json.loads(response.text)
        for entry in python_dict["body"]["result"]:
            letv_item = LetvItem()
            letv_item["nick"] = entry["nick"]           # anchor nickname
            letv_item["image"] = entry["screenshot"]    # screenshot URL
            # Hand the item over to the item pipelines.
            yield letv_item
        # status == "1" signals that more pages are available.
        # .get(..., {}) avoids an AttributeError when "header" is absent
        # (the original chained .get() would crash on a malformed response).
        if python_dict.get("header", {}).get("status") == "1":
            self.page += 1
            new_url = self.pre + str(self.page) + self.suf
            # Scrapy de-duplicates requests: an already-seen URL is not
            # fetched again, so the crawl terminates once no new page
            # URLs remain.
            yield scrapy.Request(new_url, callback=self.parse)
3.pipelines.py
import json
import os

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

from Letv.settings import IMAGES_STORE
# from scrapy.utils.project import get_project_settings
class LetvImagePipeline(ImagesPipeline):
    """Download each item's screenshot and rename it to '<nick>.jpg'.

    Extends scrapy's ImagesPipeline: get_media_requests schedules the
    download, item_completed renames the stored file to the anchor's
    nickname and records the final path on the item.
    """
    # IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Queue the item's image URL so the media downloader fetches it."""
        yield scrapy.Request(item["image"])

    def item_completed(self, results, item, info):
        """Called after downloads finish; results is [(ok, info_dict), ...].

        Raises DropItem when the download failed (the original code
        indexed [0] into an empty list and crashed with IndexError).
        """
        downloaded = [x["path"] for ok, x in results if ok]
        if not downloaded:
            raise DropItem("image download failed for %r" % item.get("nick"))
        old_image_name = os.path.join(IMAGES_STORE, downloaded[0])
        # Strip path separators from the nickname so a crafted nick
        # cannot escape the images directory.
        safe_nick = item["nick"].replace("/", "_").replace("\\", "_")
        new_image_name = os.path.join(IMAGES_STORE, safe_nick + ".jpg")
        # os.replace overwrites an existing target (duplicate nicks) and
        # works on Windows, where os.rename would raise FileExistsError.
        os.replace(old_image_name, new_image_name)
        item["image_path"] = new_image_name
        return item
# Default text pipeline: serializes each item as one JSON line.
class LetvPipeline(object):
    """Write every scraped item to '<spider name>.json', one JSON object per line."""

    def open_spider(self, spider):
        """Called once when the spider starts: open the output file.

        BUG FIX: utf-8 is forced explicitly. The original relied on the
        platform default encoding, which raises UnicodeEncodeError for
        the non-ASCII nicknames (written with ensure_ascii=False) on a
        Windows locale codepage.
        """
        self.file = open(spider.name + ".json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize one item to a JSON line and pass it on unchanged."""
        json_str = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(json_str)
        return item  # hand the item to later pipelines

    def close_spider(self, spider):
        """Called once when the spider finishes: close the output file."""
        self.file.close()
4.settings.py
# Do not obey robots.txt (the API endpoint would otherwise be blocked).
ROBOTSTXT_OBEY = False
# Enabled item pipelines; lower number = runs earlier.
ITEM_PIPELINES = {
    'Letv.pipelines.LetvPipeline': 301,       # save items as JSON text
    'Letv.pipelines.LetvImagePipeline': 300,  # download and rename images
}
# Directory for downloaded images. Required: without IMAGES_STORE the
# ImagesPipeline does not download anything. Must be a valid path.
IMAGES_STORE = "./images"
5.运行文件 ---start.py
from scrapy import cmdline

# Launch the "Letvlive" spider programmatically — equivalent to running
# `scrapy crawl Letvlive` from a shell in the project directory.
cmdline.execute(["scrapy", "crawl", "Letvlive"])