Scrapy 提供了专门用于下载文件和图片的 Pipeline。下载文件和图片的原理与抓取网页的原理相同,因此它们的下载过程同样支持多线程与异步,十分高效。
首先在settings中配置图片存放路径
# Directory where downloaded images are stored (path is relative, so it is
# resolved against the working directory the crawl is started from).
IMAGES_STORE = "./images"
在item中定义需要的数据结构
class Images360Item(scrapy.Item):
    """Container for a single image record.

    Declares four Scrapy fields (``id``, ``title``, ``url``, ``thumb``) plus
    two plain class attributes, ``collection`` and ``table``, which both hold
    the string ``"images"``.
    """

    # Both aliases carry the same value; presumably the storage name used by
    # a database pipeline (collection / table) -- verify against that pipeline.
    table = "images"
    collection = table

    # Declared item fields.
    id = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    thumb = scrapy.Field()
定义spider与parse
import json
from urllib.parse import urlencode

import scrapy
from scrapy import Request

from images360.images360.items import Images360Item
class ImagesSpider(scrapy.Spider):
    """Spider that walks the images.so.com JSON listing and yields image items.

    ``start_requests`` builds one listing URL per page and ``parse`` converts
    every entry of the JSON ``list`` array into an ``Images360Item``.
    """

    name = 'images'
    allowed_domains = ['images.so.com']
    start_urls = ['http://images.so.com/']

    def start_requests(self):
        """Yield one listing request per page.

        The number of pages comes from the ``MAX_PAGE_SIZE`` setting. Each
        page is addressed through the ``sn`` query parameter, computed as
        ``page * 30``.

        Yields:
            Request: a request for each listing page, handled by ``parse``.
        """
        base_url = 'http://images.so.com/zj?'
        # getint() converts string values (e.g. passed with ``-s`` on the
        # command line) and returns 0 when the setting is missing, instead of
        # failing with a TypeError on ``None + 1`` / ``'5' + 1``.
        max_page = self.settings.getint('MAX_PAGE_SIZE')
        for page in range(1, max_page + 1):
            query = {
                'ch': 'photography',
                'listtype': 'hot',
                'sn': page * 30,
            }
            url = base_url + urlencode(query)
            self.logger.debug('Requesting listing page: %s', url)
            yield Request(url, callback=self.parse)

    def parse(self, response):
        """Turn a JSON listing response into image items.

        Args:
            response: the listing response; its text is decoded as JSON.

        Yields:
            Images360Item: one item per entry in the response's ``list``
            array. Keys missing from an entry are stored as empty strings.
        """
        payload = json.loads(response.text)
        # ``or []`` also covers an explicit null / empty ``list`` value.
        for entry in payload.get('list') or []:
            images_item = Images360Item()
            images_item['id'] = entry.get('imageid', '')
            images_item['title'] = entry.get('group_title', '')
            images_item['url'] = entry.get('qhimg_url', '')
            images_item['thumb'] = entry.get('qhimg_thumb_url', '')
            yield images_item
定义项目管道
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class ImagesPipeline(ImagesPipeline):
    """Image pipeline that downloads ``item['url']`` and drops failed items.

    NOTE: this class reuses the name of its imported base class. The base is
    looked up when the ``class`` statement runs, so inheritance works, but the
    imported name is shadowed from this point on in the module.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image URL.

        Items whose ``url`` is missing or empty yield no request (building a
        ``Request`` from an empty URL would raise); they then reach
        ``item_completed`` with no results and are dropped there.
        """
        url = item.get('url')
        if url:
            yield Request(url)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the file name to store the downloaded image under.

        The name is the last ``/``-separated segment of the request URL.
        ``item`` is accepted as an optional keyword-only argument because
        newer Scrapy releases pass it; older callers that omit it still work.
        """
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Return the item if any download succeeded, otherwise drop it.

        Args:
            results: list of ``(ok, detail)`` pairs for the item's downloads,
                e.g. ``[(True, {'url': ..., 'path': ..., 'checksum': ...})]``.
            item: the item the downloads belong to.
            info: pipeline bookkeeping object (unused here).

        Raises:
            DropItem: when no download for this item succeeded.
        """
        image_paths = [detail['path'] for ok, detail in results if ok]
        if not image_paths:
            raise DropItem('image download failed')
        return item
注:ImagesPipeline 的优先级应该比存入数据库的 Pipeline 高(即在 ITEM_PIPELINES 中对应的数值更小),这样图片下载失败的 item 会先被丢弃,不会被写入数据库。
原文:https://www.cnblogs.com/mangM/p/10768406.html