spider
import scrapy
from selenium import webdriver

from wangyiPro.items import WangyiproItem

"""
Crawl the titles and bodies of NetEase (163.com) domestic and world news.
"""


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.163.com']
    start_urls = ['https://news.163.com/domestic/', 'https://news.163.com/world/']

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--window-position=0,0')   # initial Chrome window position
        options.add_argument('--window-size=1080,800')  # initial Chrome window size
        self.browser = webdriver.Chrome(executable_path='C://xx//chromedriver.exe',
                                        chrome_options=options)

    def parse(self, response):
        div_list = response.xpath('//div[@class="ndi_main"]/div')
        for div_item in div_list:
            title = div_item.xpath('./div/div[1]/h3/a/text()').extract_first()
            new_detail_url = div_item.xpath('./div/div[1]/h3/a/@href').extract_first()
            item = WangyiproItem()
            item['title'] = title
            # Request the news detail page, passing the item along via meta
            yield scrapy.Request(url=new_detail_url, callback=self.parse_detail,
                                 meta={'item': item})

    # Parse the body of a news detail page
    def parse_detail(self, response):
        content = response.xpath('//*[@id="endText"]//text()').extract()
        content = ''.join(content)
        item = response.meta['item']
        item['content'] = content.strip()
        yield item

    # Called once when the spider closes; shut down the shared browser
    def closed(self, reason):
        self.browser.quit()
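Note: the `executable_path` and `chrome_options` keywords used above are deprecated in Selenium 4 and removed in later 4.x releases; there the driver is built from a `Service` object plus the `options` keyword instead. A minimal sketch of the equivalent initialization (the driver path is the original post's placeholder, not a real path):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument('--window-position=0,0')   # initial Chrome window position
options.add_argument('--window-size=1080,800')  # initial Chrome window size
# Placeholder driver path carried over from the original post
browser = webdriver.Chrome(service=Service('C://xx//chromedriver.exe'), options=options)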
middleware
from scrapy import signals
from scrapy.http import HtmlResponse
from time import sleep


class WangyiproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept responses and tamper with the selected ones
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Pick out the responses to tamper with:
        # - the URL identifies the request
        # - the request identifies the response
        # - spider is the running spider instance
        bro = spider.browser  # the browser object created in the spider class
        if request.url in spider.start_urls:
            # Tamper: build a new response object containing the dynamically
            # loaded news data to replace the original one, using Selenium
            # to conveniently fetch the dynamic content
            bro.get(request.url)
            sleep(3)
            bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            sleep(1)
            page_text = bro.page_source  # now includes dynamically loaded content
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf-8', request=request)
            return new_response
        else:
            # All other requests pass through unchanged.
            # Must either:
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
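The swap in `process_response` works because `HtmlResponse` accepts a unicode body together with an explicit encoding and then exposes the same selector API as a normally downloaded page. A tiny standalone sketch of that round trip (the markup here is made up purely for illustration):

from scrapy.http import HtmlResponse

page_text = '<div class="ndi_main"><div><h3><a href="/n1">headline</a></h3></div></div>'
resp = HtmlResponse(url='https://news.163.com/domestic/', body=page_text, encoding='utf-8')
print(resp.xpath('//div[@class="ndi_main"]//a/@href').extract_first())  # prints: /n1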
pipeline file
import pymysql


class WangyiproPipeline(object):
    # Constructor
    def __init__(self):
        self.conn = None    # database connection
        self.cursor = None  # database cursor
        self.num = 0

    # The methods below all override the parent class.
    # Runs once, when the spider starts
    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='192.168.xx.xx', port=3306, user='root',
                                    password='xx', db='xx_db', charset='utf8')
        print('spider database connection opened')

    # Handles the item objects. Because this method is called once per item,
    # opening and closing the connection live in the two methods that each
    # run only once.
    def process_item(self, item, spider):
        title = item['title']
        content = item['content']
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into qiubai values(%s,%s)', (title, content))
            self.conn.commit()
        except Exception as e:
            print(e, content[0:20])
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('spider database connection closed')
        self.cursor.close()
        self.conn.close()
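The pipeline assumes a `qiubai` table with exactly two columns already exists; the post never shows its schema, so the DDL below is an assumption inferred from the two-placeholder INSERT, executed here through the same pymysql connection settings:

import pymysql

conn = pymysql.Connect(host='192.168.xx.xx', port=3306, user='root',
                       password='xx', db='xx_db', charset='utf8')
with conn.cursor() as cursor:
    # Assumed schema: column names and types are guesses, not from the post
    cursor.execute("""
        create table if not exists qiubai (
            title   varchar(255),
            content text
        ) default charset=utf8
    """)
conn.commit()
conn.close()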
items file
import scrapy


class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
settings configuration
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'  # spoof the User-Agent of outgoing requests

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False  # ignore / do not obey the robots protocol

# Only show log messages of the specified level
LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wangyiPro.middlewares.WangyiproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}
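With the downloader middleware and pipeline registered as above, the project is started like any other Scrapy crawl, from the project root:

scrapy crawl wangyi

(`wangyi` is the `name` attribute declared in the spider class.)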
Original post: https://www.cnblogs.com/xiao-apple36/p/12635470.html