仅用于备忘录
____movie.py
import scrapy
from moviePro.items import MovieproItem
class MovieSpider(scrapy.Spider):
    """Spider that collects movie titles and descriptions from www.4567kan.com.

    ``parse`` walks the listing pages and schedules one request per movie
    detail page; ``parse_detail`` completes the item with the description
    found on that detail page and yields it.
    """

    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/frim/index1.html']
    # URL template for the listing pages that follow the start page.
    url = 'http://www.4567kan.com/frim/index1-%d.html'
    # Number of the next listing page to request.
    pageNum = 2

    def parse(self, response):
        """Parse one listing page.

        Yields a detail-page request for every movie entry that has a link,
        then (while ``pageNum`` is below 5) a request for the next listing
        page, handled by this same method.
        """
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('./div/a/@title').extract_first()
            href = li.xpath('./div/a/@href').extract_first()
            if href is None:
                # No link in this entry: concatenating None would raise
                # TypeError, so skip it instead of aborting the whole page.
                continue
            detail_url = 'http://www.4567kan.com' + href
            item = MovieproItem()
            item['title'] = title
            # Request the detail page. The meta dict is handed over to the
            # callback, which lets parse_detail finish the same item.
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'item': item},
            )

        # Fetch the first four listing pages in total (start page + 2..4).
        if self.pageNum < 5:
            new_url = self.url % self.pageNum
            self.pageNum += 1
            yield scrapy.Request(url=new_url, callback=self.parse)

    def parse_detail(self, response):
        """Parse a detail page and yield the completed item."""
        # Pick up the item passed along through the request's meta dict.
        item = response.meta['item']
        desc = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]'
        ).extract_first()
        item['desc'] = desc
        yield item


# ____________________item.py
import scrapy
class MovieproItem(scrapy.Item):
    """Item carrying the data scraped for a single movie."""

    # define the fields for your item here like:
    # name = scrapy.Field()

    # Movie title; filled in by MovieSpider.parse from the listing page.
    title = scrapy.Field()
    # Movie description; filled in by MovieSpider.parse_detail.
    desc = scrapy.Field()


# ________________________pipeline.py
class MovieproPipeline:
    """Item pipeline that prints every item it receives."""

    def process_item(self, item, spider):
        """Print ``item`` and return it unchanged.

        ``spider`` is accepted as part of the pipeline signature but is not
        used here.
        """
        print(item)
        return item


# _______________________setting.py
# Register the pipeline above with priority value 300.
ITEM_PIPELINES = {
    'moviePro.pipelines.MovieproPipeline': 300,
}
BOT_NAME = 'moviePro'
# Packages in which the project's spiders live / where new ones are created.
SPIDER_MODULES = ['moviePro.spiders']
NEWSPIDER_MODULE = 'moviePro.spiders'

# Browser-like User-Agent string sent with requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'

# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False
# Only log messages at ERROR level and above.
LOG_LEVEL = 'ERROR'

# Persistent storage is omitted from this memo.
发布者:全栈程序员栈长,转载请注明出处:https://javaforall.cn/159333.html 原文链接:https://javaforall.cn