
Using multiple threads to crawl the first ten pages of jokes from Qiushibaike (糗事百科)


Define the decorator function

import requests
from lxml import etree
from queue import Queue
from threading import Thread

def run_forever(func):
    def wrapper(obj):
        while True:        # keep calling the wrapped method on the same instance
            func(obj)
    return wrapper
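The decorator simply re-invokes the wrapped method on the same instance forever, so any method marked with @run_forever never returns and is only meant to be started inside a daemon thread. A minimal illustration with a made-up class (the names here are not from the original code):

class Demo:
    @run_forever
    def work(self):
        print('one pass of work')   # in the crawler, one pass handles one queue item

# Demo().work() would loop and print forever; the crawler therefore only
# starts such methods via Thread(target=...) as daemon threads.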

Class initialization (the __init__ below and the methods that follow all belong to the get_qiushibaike class)

class get_qiushibaike:
    def __init__(self, page):
        self.max_page = page
        self.url_head = 'https://www.qiushibaike.com'
        self.url_mid = 'text/page/'
        self.url_detail = '/'
        self.count = 0                        # number of pages saved so far

        self.url_queue = Queue()              # list-page URLs waiting to be fetched
        self.get_url_content_queue = Queue()  # detail-page URLs ready to be downloaded
        self.url_queue_all = Queue()          # detail-page URLs extracted from the list pages
        self.page_url_list = []

Define the class methods

    def add_url_to_queue(self):
        # Enqueue the list-page URLs for pages 1 .. max_page - 1 (see the example below).
        for i in range(1, self.max_page):
            self.url_queue.put(self.url_head + self.url_detail + self.url_mid + str(i) + self.url_detail)
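As a quick sanity check of how the URL fragments from __init__ are concatenated, with max_page = 12 the loop enqueues pages 1 through 11 in this form:

url_head, url_mid, url_detail = 'https://www.qiushibaike.com', 'text/page/', '/'
for i in range(1, 12):                                    # range(1, max_page)
    print(url_head + url_detail + url_mid + str(i) + url_detail)
# first URL printed: https://www.qiushibaike.com/text/page/1/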

  

    @run_forever
    def get_page_url_to_list(self):
        # Fetch one list page and push every joke's detail URL onto url_queue_all.
        url = self.url_queue.get()
        response = requests.get(url)
        if response.status_code != 200:
            self.url_queue.put(url)        # fetch failed: re-queue the URL for a retry
            self.url_queue.task_done()     # mark this attempt done so join() stays balanced
            print('url {} failed, re-queued'.format(url))
        else:
            html = etree.HTML(response.text)
            url_list = html.xpath('//a[@class="contentHerf"]/@href')

            for url in url_list:
                self.url_queue_all.put(self.url_head + url)
            self.url_queue.task_done()
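A self-contained sketch of the extraction step, run against a made-up HTML fragment instead of a real Qiushibaike response (the contentHerf class name comes from the XPath above; the hrefs are invented for the example):

from lxml import etree

sample = '''
<div>
  <a class="contentHerf" href="/article/111111111">joke one</a>
  <a class="contentHerf" href="/article/222222222">joke two</a>
</div>
'''
html = etree.HTML(sample)
print(html.xpath('//a[@class="contentHerf"]/@href'))
# ['/article/111111111', '/article/222222222'] -- each href is then prefixed with url_head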

    @run_forever
    def get_url_to_content_queue(self):
        # Move detail URLs from url_queue_all into the download queue.
        url = self.url_queue_all.get()
        print(url)
        self.get_url_content_queue.put(url)
        self.url_queue_all.task_done()

    @run_forever
    def get_content(self):
        # Download one detail page and append its title and text to qiushi.txt.
        url = self.get_url_content_queue.get()
        try:
            response = requests.get(url, timeout=1)
            if response.status_code != 200:
                self.get_url_content_queue.put(url)        # re-queue for a retry
                self.get_url_content_queue.task_done()     # keep join() balanced
            else:
                html = etree.HTML(response.text)
                title = html.xpath('//h1[@class="article-title"]/text()')
                contents = html.xpath('//div[@class="content"]/text()')
                with open('qiushi.txt', 'a', encoding='utf8') as p:
                    for x in title:
                        p.write('TITLE:' + x)
                        p.write('\n')
                    for i in contents:
                        p.write(i + '\n')
                    p.write('\n')
                response.close()
                self.count += 1
                print('Pages downloaded: {}'.format(self.count))
                self.get_url_content_queue.task_done()
        except Exception:
            print('url trouble: {}'.format(url))
            self.get_url_content_queue.put(url)            # re-queue for a retry
            self.get_url_content_queue.task_done()         # keep join() balanced

    def run_sue_more_task(self, func, count=1):
        # Start `count` daemon threads, all running the same worker function.
        for i in range(0, count):
            t = Thread(target=func)
            t.daemon = True        # daemon threads die when the main thread exits
            t.start()
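Since every worker loops forever under run_forever, the threads have to be daemon threads: a daemon thread does not keep the interpreter alive, so the program can exit as soon as the queue joins in run() return. A tiny standalone illustration (not part of the crawler itself):

import time
from threading import Thread

def spin():
    while True:            # never returns, just like the run_forever workers
        time.sleep(0.1)

t = Thread(target=spin)
t.daemon = True            # without this, the program could never exit
t.start()
print('main thread can finish; the daemon thread is killed when the process exits')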

    def run(self):
        self.add_url_to_queue()                                    # seed the list-page URLs
        self.run_sue_more_task(self.get_page_url_to_list, 3)       # 3 threads: list pages -> detail links
        self.run_sue_more_task(self.get_url_to_content_queue, 3)   # 3 threads: hand links to the download queue
        self.run_sue_more_task(self.get_content, 5)                # 5 threads: download and save the jokes
        self.url_queue.join()               # each join() blocks until the queue's items are all task_done()
        self.get_url_content_queue.join()
        self.url_queue_all.join()
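The three join() calls are what keep the main thread alive until the work is finished: Queue.join() returns only once every item that was put() has been matched by a task_done() call. A minimal standalone sketch of that contract:

from queue import Queue
from threading import Thread

q = Queue()
for i in range(3):
    q.put(i)

def worker():
    while True:
        item = q.get()
        print('handled', item)
        q.task_done()      # one task_done per get() keeps join() balanced

Thread(target=worker, daemon=True).start()
q.join()                   # returns only after all three items are marked done
print('all queue items processed')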

Create an instance and call run()
if __name__ == '__main__':
    qbs = get_qiushibaike(12)   # max_page = 12
    qbs.run()

ps: crawling has its risks, so mind the IP bans; multithreading feels great for a moment, but a banned account hurts forever.
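In the spirit of that warning, a hedged sketch of the kind of throttling one might add before running this (the User-Agent string, delay, and helper name are arbitrary choices, not part of the original post):

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}    # example UA only

def polite_get(url):
    time.sleep(1)                          # crude rate limit between requests
    return requests.get(url, headers=HEADERS, timeout=5)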



 


Original post: https://www.cnblogs.com/argos/p/13210444.html
