import scrapy
##################################### Crawling multiple pages (pagination)

class XihSpider(scrapy.Spider):
    name = 'xih'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.521609.com/meinvxiaohua/list121.html']
    # URL template for the following pages
    url = 'http://www.521609.com/meinvxiaohua/list12%d.html'
    page_num = 2

    def parse(self, response):
        all_li = response.xpath('//*[@id="content"]/div[2]/div[2]/ul/li')
        for li in all_li:
            # title = li.xpath('./a[1]/img/@alt').extract_first()
            title = li.xpath('./a[2]/text() | ./a[2]/b/text()').extract_first()
            print(title)
        # Crawl up to page 5, requesting each next page recursively
        if self.page_num <= 5:
            new_url = self.url % self.page_num
            self.page_num += 1
            # callback: the function that will parse the response of the new request
            yield scrapy.Request(url=new_url, callback=self.parse)
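As an aside (not in the original post), recent Scrapy versions also provide response.follow, which builds the Request object and resolves relative URLs for you; the pagination tail of parse could be written roughly like this:

        if self.page_num <= 5:
            next_url = self.url % self.page_num
            self.page_num += 1
            # response.follow is equivalent to yielding scrapy.Request here
            yield response.follow(next_url, callback=self.parse)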
Passing data between requests (request meta)

import scrapy
from selenium import webdriver
from wangyiRro.items import WangyirroItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.163.com']
    start_urls = ['http://www.163.com/']
    # URLs of the five news sections to be parsed
    moderls_uerls = []

    def __init__(self):
        # One shared browser instance, used later by the downloader middleware
        self.bro = webdriver.Chrome()

    def parse(self, response):
        # Collect the section links from the home page
        a = response.xpath('//*[@id="js_index2017_wrap"]/div[1]/div[2]/div[1]/div[2]/ul/li[2]/a[1]')
        for i in a:
            a_url = i.xpath('./@href').extract_first()
            # Alternative: pick specific sections by index, e.g.
            # for index in [1, 3]:
            #     a_url = li_list[index].xpath('./a/@href').extract_first()
            self.moderls_uerls.append(a_url)
        # Send a request to each section URL in turn
        for url in self.moderls_uerls:
            yield scrapy.Request(url=url, callback=self.parse_modle)

    def parse_modle(self, response):
        # Parse each section page for the news titles and the detail-page URLs
        div_list = response.xpath('//*[@id="stock2016_wrap"]/div/div[3]/div[3]/div[2]/div[2]/div[3]/div/ul/li[1]/div/div')
        for div in div_list:
            title = div.xpath('./div/div[1]/a/text()').extract_first()
            title_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            item = WangyirroItem()
            item['title'] = title
            # Request the detail page; meta carries the item to the next callback
            yield scrapy.Request(url=title_url, callback=self.parse_datail, meta={'item': item})

    def parse_datail(self, response):
        # Retrieve the item passed along via meta and fill in the article body
        content = response.xpath('//*[@id="content"]/div[3]//text()').extract()
        content = ''.join(content)
        item = response.meta['item']
        item['content'] = content
        yield item

    def closed(self, spider):
        # Shut the browser down once the spider finishes
        self.bro.quit()
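For completeness, the spider above only works if WangyirroItem declares the two fields it fills in; a minimal items.py sketch for the wangyiRro project, inferred from the fields actually used (the file itself is not shown in the original post):

import scrapy


class WangyirroItem(scrapy.Item):
    # Fields populated in parse_modle and parse_datail above
    title = scrapy.Field()
    content = scrapy.Field()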
############################## middlewares #############

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from time import sleep
from scrapy.http import HtmlResponse
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class WangyirroDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept the responses of the five section pages and replace them
    def process_response(self, request, response, spider):  # spider is the running spider instance
        bro = spider.bro  # the browser object created in the spider class
        # Pick out the responses that need to be replaced:
        # the url identifies the request, and the request identifies the response
        if request.url in spider.moderls_uerls:
            # These responses belong to the five section pages.
            # Build a new response object that contains the dynamically loaded
            # news data and return it in place of the original one.
            # Selenium is a convenient way to obtain that dynamically loaded data.
            bro.get(request.url)
            sleep(2)
            page_text = bro.page_source  # includes the dynamically loaded news data
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            # Responses of all other requests are passed through unchanged
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
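Note that this middleware only runs once it is registered in the project's settings.py; a minimal snippet, assuming the standard module path of the wangyiRro project:

DOWNLOADER_MIDDLEWARES = {
    'wangyiRro.middlewares.WangyirroDownloaderMiddleware': 543,
}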
Image scraping

import scrapy

from imagePre.items import ImagepreItem


class ImageSpider(scrapy.Spider):
    name = 'image'
    # allowed_domains = ['www.sdsaasd.cpm']
    start_urls = ['https://sc.chinaz.com/tupian/shanshuitupian.html']
    # URL template for the following pages
    url = 'https://sc.chinaz.com/tupian/shanshuitupian_%d.html'
    page_num = 2

    def parse(self, response):
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            # the page stores the image URL in the src2 attribute (lazy loading)
            url_img = 'http:' + div.xpath('./div/a/img/@src2').extract_first()
            item = ImagepreItem()
            item['url_img'] = url_img
            yield item
        if self.page_num <= 5:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
Pipeline

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# class ImageprePipeline:
#     def process_item(self, item, spider):
#         return item

from scrapy.pipelines.images import ImagesPipeline
import scrapy


class imagesPilines(ImagesPipeline):

    # Request the image data from the image URL stored in the item
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['url_img'])

    # Specify the storage path (file name) for the image
    def file_path(self, request, response=None, info=None, *, item=None):
        name_image = request.url.split('/')[-1]
        return name_image

    # Return the item to the next pipeline class in line
    def item_completed(self, results, item, info):
        return item
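If the saved file paths are also needed on the item, item_completed could be extended along these lines; results is a list of (success, file_info) tuples, and the image_paths field used here is hypothetical, not declared in the item below:

    def item_completed(self, results, item, info):
        # Keep only the storage paths of successfully downloaded images.
        # 'image_paths' would need to be added to ImagepreItem as a scrapy.Field().
        item['image_paths'] = [file_info['path'] for ok, file_info in results if ok]
        return item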
Item

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ImagepreItem(scrapy.Item):
    # define the fields for your item here like:
    url_img = scrapy.Field()
Settings

# Scrapy settings for imagePre project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'imagePre'

SPIDER_MODULES = ['imagePre.spiders']
NEWSPIDER_MODULE = 'imagePre.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'imagePre.middlewares.ImagepreSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'imagePre.middlewares.ImagepreDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'imagePre.pipelines.imagesPilines': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Directory where the downloaded images are stored
IMAGES_STORE = './tupianshanshui'
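One practical note (not from the original post): Scrapy's ImagesPipeline relies on Pillow for image handling, so a typical way to install that dependency and run this spider from the project root would be:

pip install Pillow
scrapy crawl image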
Source: https://www.cnblogs.com/thaimj1314520/p/14698435.html