1. In the spider file, parse out the image URL and the image name, wrap them in an item object, and submit it to the pipeline.
2. In the pipelines file:
- from scrapy.pipelines.images import ImagesPipeline
- define a pipeline class that inherits from ImagesPipeline
- override three methods of the parent class:
- get_media_requests
- file_path: only needs to return the image name
- item_completed
3. Add the following setting to the settings file (a minimal sketch follows this list):
- IMAGES_STORE = 'folder path'
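A minimal settings.py sketch for step 3, assuming the project is named xiaohuaPro (matching the import in the spider below); the ./imgs folder name and the priority value 300 are illustrative, not from the original post:

# settings.py (relevant lines only)

# Register the custom image pipeline; 300 is an arbitrary priority value.
ITEM_PIPELINES = {
    'xiaohuaPro.pipelines.XiaohuaproPipeline': 300,
}

# Directory under which ImagesPipeline saves the downloaded images.
IMAGES_STORE = './imgs'

Note that Scrapy's ImagesPipeline also requires the Pillow package to be installed.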
# -*- coding: utf-8 -*-
import scrapy
from xiaohuaPro.items import XiaohuaproItem


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.521609.com/daxuemeinv/']

    def parse(self, response):
        # image URL + image name
        li_list = response.xpath('//*[@id="content"]/div[2]/div[2]/ul/li')
        for li in li_list:
            img_src = 'http://www.521609.com' + li.xpath('./a[1]/img/@src').extract_first()
            img_name = li.xpath('./a[1]/img/@alt').extract_first() + '.jpg'
            item = XiaohuaproItem()
            item['img_name'] = img_name
            item['img_src'] = img_src
            yield item
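The spider imports XiaohuaproItem from xiaohuaPro.items; that file is not shown in the original post, but a minimal sketch, inferred from the two fields the spider assigns, could look like this:

# items.py
import scrapy


class XiaohuaproItem(scrapy.Item):
    img_src = scrapy.Field()   # full URL of the image
    img_name = scrapy.Field()  # file name to save the image under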
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class XiaohuaproPipeline(ImagesPipeline):
    # Send requests for the image data.
    # The item parameter is the item submitted by the spider file.
    def get_media_requests(self, item, info):
        # meta passes the item dict along to the file_path method
        yield scrapy.Request(item['img_src'], meta={'item': item})

    # Specify the storage path of the image (relative to IMAGES_STORE).
    def file_path(self, request, response=None, info=None):
        # retrieve the image name from the item carried in request.meta
        item = request.meta['item']
        img_name = item['img_name']
        return img_name

    # Pass the item on to the next pipeline class to be executed.
    def item_completed(self, results, item, info):
        return item
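With the spider, item, and pipeline in place, run scrapy crawl xiaohua from the project root: get_media_requests fetches each image, and the file is saved under IMAGES_STORE with the name returned by file_path.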
Original post: https://www.cnblogs.com/wgwg/p/13273967.html