首页 > 其他 > 详细

21天打造分布式爬虫-下载汽车之家图片(九)

时间:2019-12-25 15:13:29      阅读:117      评论:0      收藏:0      [点我收藏+]

9.1.下载汽车之家图片

新建项目

scrapy startproject bmx

scrapy genspider bmx5 "car.autohome.com.cn"

bmx5.py

# -*- coding: utf-8 -*-
import scrapy
from bmx.items import BmxItem

class Bmx5Spider(scrapy.Spider):
    """Spider that scrapes image URLs from an Autohome car picture page.

    Yields one BmxItem per category section on the gallery page, carrying
    the category title and the list of absolute image URLs.
    """
    # These must be string literals — the scraped post dropped the quotes,
    # which would raise NameError at import time.
    name = "bmx5"
    allowed_domains = ["car.autohome.com.cn"]
    start_urls = ["https://car.autohome.com.cn/pic/series/159.html"]

    def parse(self, response):
        """Parse the gallery page and yield a BmxItem per image category.

        The first div.uibox is skipped, matching the original [1:] slice.
        """
        # XPath strings use plain ASCII quotes; the scrape had mangled
        # them into curly quotes, which would match nothing.
        for uibox in response.xpath("//div[@class='uibox']")[1:]:
            category = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
            # img @src values are scheme/host-relative; make them absolute.
            raw_urls = uibox.xpath(".//ul/li/a/img/@src").getall()
            urls = [response.urljoin(url) for url in raw_urls]
            yield BmxItem(category=category, image_urls=urls)

items.py

# -*- coding: utf-8 -*-

import scrapy

class BmxItem(scrapy.Item):
    """Item carrying one gallery category and its image URLs.

    The `image_urls` / `images` pair follows the field-name contract of
    scrapy's ImagesPipeline: the pipeline reads `image_urls` and stores
    download results in `images`.
    """
    category = scrapy.Field()    # gallery section title
    image_urls = scrapy.Field()  # absolute URLs for the pipeline to fetch
    images = scrapy.Field()      # populated by the images pipeline

pipelines.py

自定义保存图片的路径

# -*- coding: utf-8 -*-
from scrapy.pipelines.images import ImagesPipeline
from bmx import settings
import os

class BMXImagesPipeline(ImagesPipeline):
    """ImagesPipeline subclass that files each image under its category.

    Images are saved to IMAGES_STORE/<category>/<filename> instead of the
    default IMAGES_STORE/full/<hash> layout.
    """

    def get_media_requests(self, item, info):
        # Called before the download requests are sent; attach the item to
        # each request so file_path() can read the category later.
        request_objs = super(BMXImagesPipeline, self).get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None):
        # Called when the image is about to be stored; returns the storage
        # path relative to IMAGES_STORE.
        path = super(BMXImagesPipeline, self).file_path(request, response, info)
        # BUG FIX: the original used a bare name (`category`) instead of the
        # string key, which raises NameError on the first download.
        category = request.item.get("category")
        image_store = settings.IMAGES_STORE
        category_path = os.path.join(image_store, category)
        # makedirs(exist_ok=True) avoids the check-then-create race and
        # also works if intermediate directories are missing.
        os.makedirs(category_path, exist_ok=True)
        image_name = path.replace("full/", "")
        return os.path.join(category_path, image_name)

settings.py

import os

# Ignore robots.txt for this tutorial crawl.
ROBOTSTXT_OBEY = False

# Throttle requests to one per second.
DOWNLOAD_DELAY = 1

# Header keys and values must be string literals — the scraped post had
# dropped every quote, which is a SyntaxError.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
}

# Use the custom images pipeline (dotted path must be a string).
ITEM_PIPELINES = {
    "bmx.pipelines.BMXImagesPipeline": 1,
}

# Directory where downloaded images are stored: <project root>/images
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), "images")

start.py

# Convenience entry point: equivalent to running `scrapy crawl bmx5`
# from the project directory.
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "bmx5"])

结果

技术分享图片

 

21天打造分布式爬虫-下载汽车之家图片(九)

原文:https://www.cnblogs.com/gaidy/p/12096737.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!