在Scrapy中使用Files Pipeline获取下载后的文件路径,可以按照以下步骤进行操作(注意:还需在settings.py中设置FILES_STORE指定文件的存储目录,否则FilesPipeline不会启用):
# settings.py: enable Scrapy's built-in FilesPipeline (lower number = earlier).
# NOTE(review): FilesPipeline also requires FILES_STORE to point at a storage
# directory/URI or it stays disabled — confirm it is set in settings.py.
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
import scrapy
class MyItem(scrapy.Item):
    """Item carrying file-download requests and their resulting paths."""
    # Input: URLs that FilesPipeline should download.
    file_urls = scrapy.Field()
    # Output: relative storage paths, filled in by the custom pipeline below.
    file_paths = scrapy.Field()
from scrapy import Spider
from myproject.items import MyItem
class MySpider(Spider):
    """Example spider that queues a single file for FilesPipeline to fetch."""

    name = 'myspider'
    start_urls = ['http://example.com']

    def parse(self, response):
        # Listing a URL in file_urls is what triggers the download pipeline;
        # urljoin resolves the relative path against the response URL.
        yield MyItem(file_urls=[response.urljoin('path/to/file')])
from scrapy.pipelines.files import FilesPipeline
from urllib.parse import urlparse
class MyFilesPipeline(FilesPipeline):
    """FilesPipeline subclass that stores each file under its URL path and
    records the resulting storage paths on the item's ``file_paths`` field."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to FILES_STORE) for *request*.

        Fixes over the naive version:
        - strip the leading '/' so the path is always relative to FILES_STORE;
        - accept the keyword-only ``item`` argument that newer Scrapy passes
          (the old three-argument signature is deprecated);
        - fall back to the default sha1-based naming when the URL has no
          usable path component (e.g. ``http://example.com/``).
        """
        path = urlparse(request.url).path.lstrip('/')
        if not path:
            # No path component to name the file after — use default naming.
            return super().file_path(request, response=response, info=info)
        return path

    def item_completed(self, results, item, info):
        """Copy the paths of successfully downloaded files onto the item.

        ``results`` is a list of ``(ok, info_or_failure)`` pairs; only the
        successful entries carry a ``'path'`` key.
        """
        if 'file_urls' in item:
            file_paths = [x['path'] for ok, x in results if ok]
            if file_paths:
                item['file_paths'] = file_paths
        return item
# settings.py: register the customised pipeline subclass instead of the
# stock FilesPipeline so file_path/item_completed overrides take effect.
ITEM_PIPELINES = {
    'myproject.pipelines.MyFilesPipeline': 2,
}
现在,当Scrapy下载文件时,文件将保存在指定的路径中,并且文件路径将存储在file_paths字段中。您可以在Spider中访问和处理这些文件路径。
领取专属 10元无门槛券
手把手带您无忧上云