
Concurrent Crawler

# -*- coding: utf-8 -*-
# software: scrapy
# datetime: 2020/4/8 2:48 PM
import gevent
from gevent import monkey
monkey.patch_all()  # patch blocking stdlib calls before importing requests
import requests
from lxml import etree
import time
from concurrent.futures.thread import ThreadPoolExecutor
from concurrent.futures.process import ProcessPoolExecutor


def timer(func):
    """Decorator that prints how long the wrapped call takes."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        ret = func(*args, **kwargs)
        print(f"Elapsed: {func}", time.time() - start_time)
        return ret

    return wrapper


class OrderSpider(object):

    def __init__(self):
        self.url = "http://www.bewindoweb.com/dwg.php"

    def request(self):
        res = requests.get(self.url)
        if res.status_code == 200:
            return self.parse(res.text)

    def parse(self, html):
        node = etree.HTML(html)
        return node.xpath("//div[@class='card-dwg-hrefc']/a/div/div[2]/text()")


@timer
def run(function, n):
    """
    Sequential crawling
    :param function: callable that fetches and parses one page
    :param n: number of requests
    :return:
    """
    a = None
    for i in range(n):
        a = function()
    print(a)


def callback(future):
    """
    Callback invoked when a future finishes
    :param future:
    :return:
    """
    return future.result()


@timer
def thread_run(function, n):
    """
    Multi-threaded crawling
    :param function: callable that fetches and parses one page
    :param n: number of requests
    :return:
    """
    pools = ThreadPoolExecutor(6)
    for i in range(n):
        result = pools.submit(function)
        result.add_done_callback(callback)
    print(result.result())  # result of the last submitted task
    pools.shutdown(wait=True)


@timer
def process_run(function, n):
    """
    Multi-process crawling
    :param function: callable that fetches and parses one page
    :param n: number of requests
    :return:
    """
    pools = ProcessPoolExecutor(6)
    for i in range(n):
        result = pools.submit(function)
        result.add_done_callback(callback)
    print(result.result())  # result of the last submitted task
    pools.shutdown(wait=True)


@timer
def gevent_run(function, n):
    """
    Coroutine (gevent) crawling
    :param function: callable that fetches and parses one page
    :param n: number of requests
    :return:
    """
    tasks = []
    for i in range(n):
        tasks.append(gevent.spawn(function))
    gevent.joinall(tasks)
    a = None
    for task in tasks:
        a = task.value
    print(a)


if __name__ == "__main__":
    n = 100
    order_spider = OrderSpider()
    run(order_spider.request, n)
    thread_run(order_spider.request, n)
    process_run(order_spider.request, n)
    gevent_run(order_spider.request, n)
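
Note that thread_run and process_run only print the result of the last submitted future. A minimal sketch (assuming the same OrderSpider.request method; thread_run_collect is an illustrative name, not part of the original script) that collects every result with concurrent.futures.as_completed could look like this:

from concurrent.futures import ThreadPoolExecutor, as_completed

def thread_run_collect(function, n):
    # Hypothetical variant: gather every parsed result instead of only the last one.
    results = []
    with ThreadPoolExecutor(max_workers=6) as pools:
        futures = [pools.submit(function) for _ in range(n)]
        for future in as_completed(futures):
            results.append(future.result())
    print(len(results), "responses collected")
    return results

This variant could be timed with the same @timer decorator for comparison against the other strategies.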

 


Original: https://www.cnblogs.com/dreamall/p/12660370.html
