# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from urllib.parse import urljoin


class BookSpider(scrapy.Spider):
    name = 'book'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.54.92.01.00.00.html']
    page_url = None

    def parse(self, response):
        # Grab every valid <li>; the xpath turned out to change on the last page,
        # so match the ul id loosely with contains()
        for each in response.xpath('//ul[contains(@id, "component_5")]/li'):
            # Title
            title = each.xpath("p[1]/a/text()").extract()
            # Price
            price = each.xpath("p[3]/span[1]/text()").extract()
            # Number of comments
            star = each.xpath("p[4]/a/text()").extract()
            # Detail / description
            detail = each.xpath("p[2]/text()").extract()
            if not detail:
                detail = ["本书暂无描述、、、、、、"]
            print('detail', detail)
            print("title", title)
            print('price', price)
            print('star', star)

        # Get the URL of the next page
        self.page_url = response.xpath(
            '//div[@class="paging"]//ul//li[10]//a/@href').extract()
        # Near the end of the listing the next-page xpath changes
        if not self.page_url:
            self.page_url = response.xpath(
                '//div[@class="paging"]//ul//li[8]//a/@href').extract()
        # page_url is a list
        for next_url in self.page_url:
            yield Request(urljoin("http://category.dangdang.com", next_url), callback=self.parse)
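The spider above only prints each field. If the data were instead meant to flow through Scrapy's item pipeline, a minimal sketch could look like the following (BookItem is a hypothetical item class for illustration, not part of the original project):

import scrapy

class BookItem(scrapy.Item):
    # Fields mirroring what parse() extracts
    title = scrapy.Field()
    price = scrapy.Field()
    star = scrapy.Field()
    detail = scrapy.Field()

Each loop iteration in parse() could then `yield BookItem(title=title, price=price, star=star, detail=detail)` instead of calling print(), and the items would be handed to whatever pipelines are enabled in settings.py below.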
# -*- coding: utf-8 -*-

# Scrapy settings for BookSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Name of the crawler
BOT_NAME = 'BookSpider'

# Module path of the spiders (SPIDER_MODULES must be a list)
SPIDER_MODULES = ['BookSpider.spiders']
NEWSPIDER_MODULE = 'BookSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Client user-agent request header
#USER_AGENT = 'BookSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
# robots.txt is a file that follows the Robots exclusion protocol and lives on the website's
# server; it tells search-engine crawlers which directories of the site they are not welcome
# to crawl or index. When Scrapy starts, it fetches the site's robots.txt first and uses it
# to decide what it may crawl.
# Sometimes the content we want is exactly what robots.txt forbids, so in those cases this
# option is set to False and the Robots protocol is ignored.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Number of concurrent requests
# With CONCURRENT_REQUESTS set and no DOWNLOAD_DELAY, the server receives a burst of requests
# at the same time; with both CONCURRENT_REQUESTS and DOWNLOAD_DELAY set, it does not.
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Download delay in seconds
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# Concurrent requests per domain; the download delay is also applied per domain
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Concurrent requests per IP; if set, CONCURRENT_REQUESTS_PER_DOMAIN is ignored and the
# download delay is applied per IP instead
#CONCURRENT_REQUESTS_PER_IP = 16
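# Illustrative only (example values, not this project's actual configuration): pairing the
# two settings is how bursts are avoided, e.g.
#   CONCURRENT_REQUESTS = 8
#   DOWNLOAD_DELAY = 1
# keeps at most 8 requests in flight and spaces requests to the same site roughly one second
# apart (Scrapy randomizes the delay between 0.5x and 1.5x by default).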
# Disable cookies (enabled by default)
# Whether to support cookies; cookiejar handles the cookie operations
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# The Telnet console lets you inspect the running crawler, operate it, and so on
# Connect with `telnet <ip> <port>`, then drive the crawler with commands
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = 6023
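# A usage sketch with the defaults above (newer Scrapy versions also prompt for the
# auto-generated password printed in the crawl log):
#   $ telnet 127.0.0.1 6023
#   >>> est()              # print a snapshot of the engine status
#   >>> engine.pause()     # pause the running crawl
#   >>> engine.unpause()   # resume it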
# Override the default request headers:
# Default request headers
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'BookSpider.middlewares.BookspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'BookSpider.middlewares.BookspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Define the pipelines that process scraped items
#ITEM_PIPELINES = {
#    'BookSpider.pipelines.BookspiderPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
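The commented-out ITEM_PIPELINES entry above refers to BookSpider.pipelines.BookspiderPipeline. As a rough sketch (the project's actual pipelines.py is not shown here), such a pipeline only needs a process_item method:

class BookspiderPipeline(object):
    def process_item(self, item, spider):
        # A real implementation might clean the fields here or store the item in a database.
        return item

Enabling it is just a matter of uncommenting ITEM_PIPELINES; the number (300) controls the order in which pipelines run, lower values first.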