import scrapy
##################################### Crawling multiple pages (pagination)

class XihSpider(scrapy.Spider):
    name = 'xih'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.521609.com/meinvxiaohua/list121.html']
    # URL template for the following pages
    url = 'http://www.521609.com/meinvxiaohua/list12%d.html'
    page_num = 2

    def parse(self, response):
        all_li = response.xpath('//*[@id="content"]/div[2]/div[2]/ul/li')
        for li in all_li:
            # title = li.xpath('./a[1]/img/@alt').extract_first()
            title = li.xpath('./a[2]/text() | ./a[2]/b/text()').extract_first()
            print(title)
        # Crawl up to page 5, requesting each next page recursively
        if self.page_num <= 5:
            new_url = self.url % self.page_num
            self.page_num += 1
            # callback: the function that will parse the response of the new request
            yield scrapy.Request(url=new_url, callback=self.parse)
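As an aside (not in the original post), recent Scrapy versions also provide response.follow, which builds the Request object and resolves relative URLs for you; the pagination tail of parse could be written roughly like this:

        if self.page_num <= 5:
            next_url = self.url % self.page_num
            self.page_num += 1
            # response.follow is equivalent to yielding scrapy.Request here
            yield response.follow(next_url, callback=self.parse)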
Passing data between requests (request meta)

import scrapy
from selenium import webdriver
from wangyiRro.items import WangyirroItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.163.com']
    start_urls = ['http://www.163.com/']
    # URLs of the five news sections to be parsed
    moderls_uerls = []

    def __init__(self):
        # One shared browser instance, used later by the downloader middleware
        self.bro = webdriver.Chrome()

    def parse(self, response):
        # Collect the section links from the home page
        a = response.xpath('//*[@id="js_index2017_wrap"]/div[1]/div[2]/div[1]/div[2]/ul/li[2]/a[1]')
        for i in a:
            a_url = i.xpath('./@href').extract_first()
            # Alternative: pick specific sections by index, e.g.
            # for index in [1, 3]:
            #     a_url = li_list[index].xpath('./a/@href').extract_first()
            self.moderls_uerls.append(a_url)
        # Send a request to each section URL in turn
        for url in self.moderls_uerls:
            yield scrapy.Request(url=url, callback=self.parse_modle)

    def parse_modle(self, response):
        # Parse each section page for the news titles and the detail-page URLs
        div_list = response.xpath('//*[@id="stock2016_wrap"]/div/div[3]/div[3]/div[2]/div[2]/div[3]/div/ul/li[1]/div/div')
        for div in div_list:
            title = div.xpath('./div/div[1]/a/text()').extract_first()
            title_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            item = WangyirroItem()
            item['title'] = title
            # Request the detail page; meta carries the item to the next callback
            yield scrapy.Request(url=title_url, callback=self.parse_datail, meta={'item': item})

    def parse_datail(self, response):
        # Retrieve the item passed along via meta and fill in the article body
        content = response.xpath('//*[@id="content"]/div[3]//text()').extract()
        content = ''.join(content)
        item = response.meta['item']
        item['content'] = content
        yield item

    def closed(self, spider):
        # Shut the browser down once the spider finishes
        self.bro.quit()
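For completeness, the spider above only works if WangyirroItem declares the two fields it fills in; a minimal items.py sketch for the wangyiRro project, inferred from the fields actually used (the file itself is not shown in the original post):

import scrapy


class WangyirroItem(scrapy.Item):
    # Fields populated in parse_modle and parse_datail above
    title = scrapy.Field()
    content = scrapy.Field()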
############################## middlewares #############

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from time import sleep
from scrapy.http import HtmlResponse
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class WangyirroDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept the responses of the five section pages and replace them
    def process_response(self, request, response, spider):  # spider is the running spider instance
        bro = spider.bro  # the browser object created in the spider class
        # Pick out the responses that need to be replaced:
        # the url identifies the request, and the request identifies the response
        if request.url in spider.moderls_uerls:
            # These responses belong to the five section pages.
            # Build a new response object that contains the dynamically loaded
            # news data and return it in place of the original one.
            # Selenium is a convenient way to obtain that dynamically loaded data.
            bro.get(request.url)
            sleep(2)
            page_text = bro.page_source  # includes the dynamically loaded news data
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            # Responses of all other requests are passed through unchanged
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
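Note that this middleware only runs once it is registered in the project's settings.py; a minimal snippet, assuming the standard module path of the wangyiRro project:

DOWNLOADER_MIDDLEWARES = {
    'wangyiRro.middlewares.WangyirroDownloaderMiddleware': 543,
}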
Image scraping

import scrapy

from imagePre.items import ImagepreItem


class ImageSpider(scrapy.Spider):
    name = 'image'
    # allowed_domains = ['www.sdsaasd.cpm']
    start_urls = ['https://sc.chinaz.com/tupian/shanshuitupian.html']
    # URL template for the following pages
    url = 'https://sc.chinaz.com/tupian/shanshuitupian_%d.html'
    page_num = 2

    def parse(self, response):
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            # the page stores the image URL in the src2 attribute (lazy loading)
            url_img = 'http:' + div.xpath('./div/a/img/@src2').extract_first()
            item = ImagepreItem()
            item['url_img'] = url_img
            yield item
        if self.page_num <= 5:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
Pipeline

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# class ImageprePipeline:
#     def process_item(self, item, spider):
#         return item

from scrapy.pipelines.images import ImagesPipeline
import scrapy


class imagesPilines(ImagesPipeline):

    # Request the image data from the image URL stored in the item
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['url_img'])

    # Specify the storage path (file name) for the image
    def file_path(self, request, response=None, info=None, *, item=None):
        name_image = request.url.split('/')[-1]
        return name_image

    # Return the item to the next pipeline class in line
    def item_completed(self, results, item, info):
        return item
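If the saved file paths are also needed on the item, item_completed could be extended along these lines; results is a list of (success, file_info) tuples, and the image_paths field used here is hypothetical, not declared in the item below:

    def item_completed(self, results, item, info):
        # Keep only the storage paths of successfully downloaded images.
        # 'image_paths' would need to be added to ImagepreItem as a scrapy.Field().
        item['image_paths'] = [file_info['path'] for ok, file_info in results if ok]
        return item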
Item

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ImagepreItem(scrapy.Item):
    # define the fields for your item here like:
    url_img = scrapy.Field()
Settings

# Scrapy settings for imagePre project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'imagePre'

SPIDER_MODULES = ['imagePre.spiders']
NEWSPIDER_MODULE = 'imagePre.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'imagePre.middlewares.ImagepreSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'imagePre.middlewares.ImagepreDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'imagePre.pipelines.imagesPilines': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Directory where the downloaded images are stored
IMAGES_STORE = './tupianshanshui'
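One practical note (not from the original post): Scrapy's ImagesPipeline relies on Pillow for image handling, so a typical way to install that dependency and run this spider from the project root would be:

pip install Pillow
scrapy crawl image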
Source: https://www.cnblogs.com/thaimj1314520/p/14698435.html