Scrapy framework basics: Twisted
Internally, Scrapy implements crawler concurrency on top of an event-loop mechanism (Twisted).
Before (sequential, blocking requests):
import requests

url_list = ['http://www.baidu.com', 'http://www.baidu.com', 'http://www.baidu.com']
for item in url_list:
    response = requests.get(item)   # blocks until each response comes back
    print(response.text)
Now (concurrent, with Twisted):
from twisted.web.client import getPage
from twisted.internet import defer, reactor

# Part 1: the agent starts taking on tasks
def callback(contents):
    print(contents)

deferred_list = []   # like a matchmaker's task list: [(client, who they want), ...]
url_list = ['http://www.bing.com', 'https://segmentfault.com/', 'https://stackoverflow.com/']
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))   # (me, who I want)
    deferred.addCallback(callback)
    deferred_list.append(deferred)

# Part 2: once the agent has finished every task, stop
dlist = defer.DeferredList(deferred_list)

def all_done(arg):
    reactor.stop()

dlist.addBoth(all_done)

# Part 3: tell the agent to get to work
reactor.run()
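Note: getPage has long been deprecated and is removed from recent Twisted releases. The same pattern with the third-party treq library would look roughly like this (treq is an assumption here, not part of the original post; any client that returns Deferreds works):

import treq
from twisted.internet import defer, reactor

def callback(body):
    print(body[:100])                    # first bytes of each page

@defer.inlineCallbacks
def fetch(url):
    response = yield treq.get(url)       # Deferred fires once headers arrive
    body = yield response.content()      # Deferred fires with the full body
    callback(body)

url_list = ['http://www.bing.com', 'https://segmentfault.com/', 'https://stackoverflow.com/']
dlist = defer.DeferredList([fetch(url) for url in url_list])
dlist.addBoth(lambda _: reactor.stop()) # stop the loop when every task is done
reactor.run()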
What is Twisted?
Non-blocking: don't wait; all requests go out at once. When I initiate connections for request A, request B, and request C, I don't wait for one connection to complete before starting the next; I fire one off and immediately fire off the next.
import socket

sk1 = socket.socket()
sk1.setblocking(False)
try:
    sk1.connect(('1.1.1.1', 80))   # returns immediately instead of waiting
except BlockingIOError:
    pass                            # the TCP handshake continues in the background

sk2 = socket.socket()
sk2.setblocking(False)
try:
    sk2.connect(('1.1.1.2', 80))
except BlockingIOError:
    pass

sk3 = socket.socket()
sk3.setblocking(False)
try:
    sk3.connect(('1.1.1.3', 80))
except BlockingIOError:
    pass
Asynchronous: callbacks. As soon as I find the A, B, or C that callback_A, callback_B, and callback_C are waiting for, I actively notify them.
def callback(contents):
    print(contents)
Event loop: I keep looping over the three socket tasks (request A, request B, request C), checking two states for each: has the connection succeeded, and has a result come back.
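A minimal sketch of such an event loop with the standard library's select module (placeholder addresses from the text above; error handling omitted):

import select
import socket

connecting, connected = [], []
for ip in ('1.1.1.1', '1.1.1.2', '1.1.1.3'):
    sk = socket.socket()
    sk.setblocking(False)
    try:
        sk.connect((ip, 80))
    except BlockingIOError:
        pass                             # handshake continues in the background
    connecting.append(sk)

while connecting or connected:
    # writable -> the connection succeeded; readable -> a result came back
    readable, writable, _ = select.select(connected, connecting, [], 1.0)
    for sk in writable:
        sk.send(b'GET / HTTP/1.1\r\nHost: example.com\r\n\r\n')
        connecting.remove(sk)
        connected.append(sk)
    for sk in readable:
        print(sk.recv(8096))             # hand the result to a callback
        sk.close()
        connected.remove(sk)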
Scrapy
I. Commands

scrapy startproject xx               # create a project
cd xx                                # enter the project directory
scrapy genspider chouti chouti.com   # generate a spider
"""write the spider code"""
scrapy crawl chouti --nolog          # run the spider (without logging)
II. Writing the spider
def parse(self, response):
1. The response:
# response encapsulates all data related to the HTTP response:
- response.text
- response.encoding
- response.body
- response.meta['depth']   # the crawl depth of this response
- response.request         # the request that produced this response; a Request
                           # wraps the URL to fetch and the callback to run
                           # once the download completes
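For example, a minimal parse callback poking at these attributes (a sketch, not project code):

def parse(self, response):
    print(response.url, response.encoding)
    print(response.request.url)            # URL of the request that produced this response
    print(response.request.callback)       # the function scheduled to handle the download
    print(response.meta.get('depth', 0))   # depth is tracked by Scrapy's DepthMiddleware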
2. Parsing
response.css('...')                   returns a SelectorList of selector objects
response.css('....').extract()        returns a list of strings
response.css('....').extract_first() returns the first element of that list
def parse_detail(self, response):
    # XPath selector version:
    # items = JobboleArticleItem()
    # img_url = response.meta.get('img_url', '')   # passed in via Request.meta (see section 3)
    # title = response.xpath('//div[@class="entry-header"]/h1/text()')[0].extract()
    # create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract()[0].strip().replace('·', '').strip()
    # praise_nums = int(response.xpath("//span[contains(@class,'vote-post-up')]/h10/text()").extract_first())
    # fav_nums = response.xpath("//span[contains(@class,'bookmark-btn')]/text()").extract_first()
    # try:
    #     if re.match(r'.*?(\d+).*', fav_nums).group(1):
    #         fav_nums = int(re.match(r'.*?(\d+).*', fav_nums).group(1))
    #     else:
    #         fav_nums = 0
    # except:
    #     fav_nums = 0
    # comment_nums = response.xpath('//a[contains(@href,"#article-comment")]/span/text()').extract()[0]
    # try:
    #     if re.match(r'.*?(\d+).*', comment_nums).group(1):
    #         comment_nums = int(re.match(r'.*?(\d+).*', comment_nums).group(1))
    #     else:
    #         comment_nums = 0
    # except:
    #     comment_nums = 0
    # content = response.xpath('//div[@class="entry"]').extract()[0]
    # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
    # tag_list = [tag for tag in tag_list if not tag.strip().endswith('评论')]   # drop the "comments" pseudo-tag
    # tags = ",".join(tag_list)
    # items['title'] = title
    # try:
    #     create_date = datetime.datetime.strptime(create_date, '%Y/%m/%d').date()
    # except:
    #     create_date = datetime.datetime.now()
    # items['date'] = create_date
    # items['url'] = response.url
    # items['url_object_id'] = get_md5(response.url)
    # items['img_url'] = [img_url]
    # items['praise_nums'] = praise_nums
    # items['fav_nums'] = fav_nums
    # items['comment_nums'] = comment_nums
    # items['content'] = content
    # items['tags'] = tags
    # CSS selector version:
    # title = response.css('.entry-header h1::text')[0].extract()
    # create_date = response.css('p.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·', '').strip()
    # praise_nums = int(response.css(".vote-post-up h10::text").extract_first())
    # fav_nums = response.css(".bookmark-btn::text").extract_first()
    # if re.match(r'.*?(\d+).*', fav_nums).group(1):
    #     fav_nums = int(re.match(r'.*?(\d+).*', fav_nums).group(1))
    # else:
    #     fav_nums = 0
    # comment_nums = response.css('a[href="#article-comment"] span::text').extract()[0]
    # if re.match(r'.*?(\d+).*', comment_nums).group(1):
    #     comment_nums = int(re.match(r'.*?(\d+).*', comment_nums).group(1))
    # else:
    #     comment_nums = 0
    # content = response.css('.entry').extract()[0]
    # tag_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
    # tag_list = [tag for tag in tag_list if not tag.strip().endswith('评论')]
    # tags = ",".join(tag_list)
    # In XPath selectors, use /@href for attributes and /text() for text nodes.
def parse_detail(self, response):
    img_url = response.meta.get('img_url', '')
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value("img_url", [img_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    items = item_loader.load_item()
    yield items
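The ArticleItemLoader used above is defined by the project, not by Scrapy itself. A plausible minimal definition (an assumption, not the original author's code) subclasses ItemLoader so that load_item() stores single values rather than lists:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst   # itemloaders.processors in newer Scrapy

class ArticleItemLoader(ItemLoader):
    # output processors run when load_item() is called;
    # TakeFirst keeps the first extracted value instead of a list
    default_output_processor = TakeFirst()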
3. Issuing further requests
yield Request(url='xxxx', callback=self.parse)
yield Request(url=parse.urljoin(response.url, post_url), meta={'img_url': img_url}, callback=self.parse_detail)
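A sketch of a list-page parse that feeds parse_detail this way (here parse is urllib.parse; the CSS selectors are illustrative assumptions, not the original project's code):

from urllib import parse
from scrapy.http import Request

def parse(self, response):
    for post in response.css('#archive .post-thumb a'):            # hypothetical list-page selector
        post_url = post.css('::attr(href)').extract_first('')
        img_url = post.css('img::attr(src)').extract_first('')
        yield Request(url=parse.urljoin(response.url, post_url),   # resolve relative links
                      meta={'img_url': img_url},                   # pass data along to the callback
                      callback=self.parse_detail)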
Original post (Chinese): https://www.cnblogs.com/zhangyafei/p/10226853.html