爬虫代码:
import scrapy
from selenium import webdriver


class WangyiSpider(scrapy.Spider):
    """Spider that follows four section links from the NetEase news front page.

    The spider owns a Selenium Chrome instance (``self.bro``) which the
    downloader middleware uses to re-render the section pages; the browser is
    shut down in :meth:`closed`.
    """

    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and start the Chrome browser.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so spider arguments (``-a key=value``)
        keep working.
        """
        super().__init__(*args, **kwargs)
        # NOTE(review): passing the driver path positionally relies on the
        # older Selenium signature -- confirm against the installed version.
        self.bro = webdriver.Chrome(r'D:\爬虫相关\资料\驱动程序\chromedriver_win32\chromedriver.exe')

    def parse(self, response):
        """Yield one request per selected section found in the navigation list.

        Args:
            response: Response for the front page listed in ``start_urls``.

        Yields:
            ``scrapy.Request`` objects whose callback is :meth:`parse_news`.
        """
        # Navigation entries; each <li> wraps a link to one news section.
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        # Positions of the domestic, international, military and aviation
        # sections inside the navigation list.
        index_url = [3, 4, 6, 7]
        for i in index_url:
            # The page layout may change; skip missing entries rather than
            # aborting the whole crawl with an IndexError.
            if i >= len(li_list):
                self.logger.warning('navigation entry %d not found on %s', i, response.url)
                continue
            url = li_list[i].xpath('./a/@href').extract_first()
            # scrapy.Request rejects a missing URL, so drop entries without href.
            if not url:
                self.logger.warning('navigation entry %d has no link on %s', i, response.url)
                continue
            yield scrapy.Request(url=url, callback=self.parse_news)

    def parse_news(self, response):
        """Print the first headline text matched on a section page."""
        print('xxx:', response.xpath('/html/body/div[1]/div[3]/div[4]/div[1]/div/div/ul/li/div/div[3]/div[1]/h3/a/text()').extract_first())

    def closed(self, spider):
        """Quit the Selenium browser when the spider closes.

        NOTE(review): Scrapy documents this hook as ``closed(reason)``, so the
        ``spider`` parameter presumably receives the close reason -- confirm.
        """
        self.bro.quit()
middlewares中间件代码:
from scrapy import signals
from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware(object):
    """Downloader middleware that swaps section pages for browser-rendered HTML."""

    # Section pages (aviation, military, international, domestic) whose
    # responses are replaced. Built once instead of on every response.
    # NOTE(review): these are http:// URLs while the spider starts from an
    # https:// page -- confirm the scheme of the links actually requested.
    DYNAMIC_URLS = frozenset([
        'http://news.163.com/air/',
        'http://war.163.com/',
        'http://news.163.com/world/',
        'http://news.163.com/domestic/',
    ])

    def process_response(self, request, response, spider):
        """Return a browser-rendered response for the configured section URLs.

        Args:
            request: The request that produced ``response``.
            response: The response obtained by the downloader.
            spider: The running spider; its ``bro`` attribute must be a
                Selenium WebDriver.

        Returns:
            An ``HtmlResponse`` built from the browser's page source when
            ``request.url`` is one of ``DYNAMIC_URLS``; otherwise the original
            ``response`` unchanged.
        """
        if request.url not in self.DYNAMIC_URLS:
            return response
        # Load the page in the spider's browser and read the resulting markup.
        spider.bro.get(request.url)
        page_text = spider.bro.page_source
        return HtmlResponse(
            url=spider.bro.current_url,
            body=page_text,
            encoding='utf-8',
            request=request,
        )
原文:https://www.cnblogs.com/duanhaoxin/p/10138865.html