
Image operations with Scrapy

import scrapy

##################################### Multi-page scraping

class XihSpider(scrapy.Spider):
    name = 'xih'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.521609.com/meinvxiaohua/list121.html']
    url = 'http://www.521609.com/meinvxiaohua/list12%d.html'
    page_num = 2

    def parse(self, response):
        all_li = response.xpath('//*[@id="content"]/div[2]/div[2]/ul/li')
        for li in all_li:
            # title = li.xpath('./a[1]/img/@alt').extract_first()
            title = li.xpath('./a[2]/text() | ./a[2]/b/text()').extract_first()
            print(title)
        # Crawl the first five pages: build the next page URL and schedule it,
        # pointing the new request back at parse via the callback argument.
        if self.page_num <= 5:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
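The spider above only prints each title. To hand the titles to a pipeline instead, parse() would yield an item, exactly as the image spider later in this post does. A minimal sketch, assuming a hypothetical XihItem defined in the project's items.py (not part of the original project):

import scrapy

class XihItem(scrapy.Item):
    # hypothetical item with a single field, mirroring ImagepreItem further down
    title = scrapy.Field()

Inside parse(), print(title) would then become:

    item = XihItem()
    item['title'] = title
    yield item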
Passing data between requests (request meta)
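The pattern here: the first callback scrapes part of an item (the news title), and a second request to the detail page fills in the rest (the article content). The partially filled item rides along on the request's meta dictionary and is picked up again in the detail callback.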
import scrapy

from selenium import webdriver
from wangyiRro.items import WangyirroItem

class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['wwww.163.com']
    start_urls = ['http://www.163.com/']
    # URLs of the news sections (the five channels) to be parsed
    moderls_uerls = []

    def __init__(self):
        # One shared browser instance; the downloader middleware uses it
        # to render the dynamically loaded section pages.
        self.bro = webdriver.Chrome()

    def parse(self, response):
        # Collect the section links from the home page
        a = response.xpath('//*[@id="js_index2017_wrap"]/div[1]/div[2]/div[1]/div[2]/ul/li[2]/a[1]')
        for i in a:
            a_url = i.xpath('./@href').extract_first()
            self.moderls_uerls.append(a_url)
        # Issue a request for each section URL in turn
        for url in self.moderls_uerls:
            yield scrapy.Request(url=url, callback=self.parse_modle)

    def parse_modle(self, response):
        # Parse each section page for the news titles and the detail-page URLs
        div_list = response.xpath('//*[@id="stock2016_wrap"]/div/div[3]/div[3]/div[2]/div[2]/div[3]/div/ul/li[1]/div/div')
        for div in div_list:
            title = div.xpath('./div/div[1]/a/text()').extract_first()
            title_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            item = WangyirroItem()
            item['title'] = title
            # Request the detail page, passing the item along via meta
            yield scrapy.Request(url=title_url, callback=self.parse_datail, meta={'item': item})

    def parse_datail(self, response):
        content = response.xpath('//*[@id="content"]/div[3]//text()').extract()
        content = ''.join(content)
        # Retrieve the item that parse_modle attached to the request
        item = response.meta['item']
        item['content'] = content
        yield item

    def closed(self, spider):
        # Shut down the browser when the spider finishes
        self.bro.quit()
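The spider imports WangyirroItem from wangyiRro.items and fills in title and content. A minimal sketch of what that items.py would contain (the field names are taken from the spider above; everything else is assumed):

import scrapy

class WangyirroItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()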
############################## middlewares #############
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from time import sleep
from scrapy.http import HtmlResponse
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

class WangyirroDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # This method intercepts the responses for the news sections and replaces them
    def process_response(self, request, response, spider):  # spider is the running spider instance
        bro = spider.bro  # the browser object defined on the spider class
        # Pick out the responses that need to be replaced:
        # the URL identifies the request, and the request identifies the response
        if request.url in spider.moderls_uerls:
            # These are the responses for the news sections.
            # Build a new response object that meets the requirement (it contains
            # the dynamically loaded news data), rendered conveniently with selenium,
            # and return it in place of the original response.
            bro.get(request.url)
            sleep(2)
            page_text = bro.page_source  # includes the dynamically loaded news data
            new_respons = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_respons
        else:
            # Responses for all other requests pass through unchanged
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
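For this middleware to intercept anything, it has to be registered in the wangyiRro project's settings.py. A sketch of the relevant entries (the priority 543 mirrors the commented template in the settings further down; the pipeline class name is an assumption and only needed if such a pipeline is defined to persist the items):

DOWNLOADER_MIDDLEWARES = {
    'wangyiRro.middlewares.WangyirroDownloaderMiddleware': 543,
}

# only if a pipeline class exists to persist the items (assumed name):
# ITEM_PIPELINES = {
#     'wangyiRro.pipelines.WangyirroPipeline': 300,
# }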
Image operations
import scrapy

from imagePre.items import ImagepreItem

class ImageSpider(scrapy.Spider):
    name = 'image'
    # allowed_domains = ['www.sdsaasd.cpm']
    start_urls = ['https://sc.chinaz.com/tupian/shanshuitupian.html']
    url = 'https://sc.chinaz.com/tupian/shanshuitupian_%d.html'
    page_num = 2

    def parse(self, response):
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            # the real image address sits in the src2 attribute (src is a lazy-load placeholder)
            url_img = 'http:' + div.xpath('./div/a/img/@src2').extract_first()
            item = ImagepreItem()
            item['url_img'] = url_img
            yield item
        # Crawl the first five listing pages
        if self.page_num <= 5:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
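Each yielded ImagepreItem carries only the image URL; the actual download happens in the custom ImagesPipeline subclass shown next.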
Pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# class ImageprePipeline:
#     def process_item(self, item, spider):
#         return item

from scrapy.pipelines.images import ImagesPipeline
import scrapy

class imagesPilines(ImagesPipeline):

    # Request the image data from the URL stored on the item
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['url_img'])

    # Specify the storage path (here just the file name) for each image
    def file_path(self, request, response=None, info=None, *, item=None):
        name_image = request.url.split('/')[-1]
        return name_image

    # Return the item so the next pipeline class in line receives it
    def item_completed(self, results, item, info):
        return item
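item_completed receives results as a list of (success, file_info) tuples, where file_info holds the keys url, path and checksum for each downloaded image. If you also wanted to record where a file ended up, a minimal sketch (the stored_path field is hypothetical and would have to be added to ImagepreItem):

    def item_completed(self, results, item, info):
        paths = [file_info['path'] for ok, file_info in results if ok]
        # item['stored_path'] = paths  # hypothetical field, not defined in ImagepreItem
        return item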
       
       
       
Item
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ImagepreItem(scrapy.Item):
    # define the fields for your item here like:
    url_img = scrapy.Field()
Settings
# Scrapy settings for imagePre project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'imagePre'

SPIDER_MODULES = ['imagePre.spiders']
NEWSPIDER_MODULE = 'imagePre.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#   'imagePre.middlewares.ImagepreSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#   'imagePre.middlewares.ImagepreDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#   'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'imagePre.pipelines.imagesPilines': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Directory where the downloaded images are stored
IMAGES_STORE = './tupianshanshui'
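With these settings, running scrapy crawl image from the project root sends every ImagepreItem through the imagesPilines pipeline, which downloads each picture into ./tupianshanshui and names the file after the last segment of its URL, as returned by file_path().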

 

Source: https://www.cnblogs.com/thaimj1314520/p/14698435.html
