定义装饰器函数
def run_forever(func):
    """Decorator: call *func* in an endless loop.

    Applied to the queue-consumer methods so that each worker thread
    keeps pulling items until its (daemon) thread is terminated or the
    wrapped function raises.

    Args:
        func: the callable to repeat; any positional/keyword arguments
              passed to the wrapper are forwarded on every iteration.

    Returns:
        The looping wrapper (metadata copied from *func* via wraps).
    """
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Loop forever; only an exception from func() breaks out.
        while True:
            func(*args, **kwargs)
    return wrapper
类初始化
def __init__(self, page): self.max_page = page self.url_head = ‘https://www.qiushibaike.com‘ self.url_mid = ‘text/page/‘ self.url_detail = ‘/‘ self.count = 0 self.url_queue = Queue() # 页面url队列 self.get_url_content_queue = Queue() # 单个页面队列 self.url_queue_all = Queue() self.page_url_list = []
定义类方法
def add_url_to_queue(self):
    """Enqueue listing-page URLs for pages 1 .. max_page-1.

    NOTE(review): range() excludes max_page itself — presumably
    intentional, but confirm the last page is meant to be skipped.
    """
    for i in range(1, self.max_page):
        self.url_queue.put(
            self.url_head + self.url_detail + self.url_mid + str(i) + self.url_detail
        )

@run_forever
def get_page_url_to_list(self):
    """Worker: take one listing page, push its detail links to url_queue_all."""
    url = self.url_queue.get()
    response = requests.get(url)
    if response.status_code != 200:
        # Bad response: put it back so another worker retries it.
        self.url_queue.put(url)
        print('url {}验证失败 重新写入'.format(url))
    else:
        html = etree.HTML(response.text)
        href_list = html.xpath('//a[@class="contentHerf"]/@href')
        for href in href_list:  # renamed from `url` — was shadowing the outer name
            self.url_queue_all.put(self.url_head + href)
    # Called once per get() in BOTH branches so url_queue.join() can finish.
    self.url_queue.task_done()

@run_forever
def get_url_to_content_queue(self):
    """Worker: forward detail URLs from url_queue_all to the download queue."""
    url = self.url_queue_all.get()
    print(url)
    self.get_url_content_queue.put(url)
    self.url_queue_all.task_done()

@run_forever
def get_content(self):
    """Worker: download one detail page and append title/content to qiushi.txt."""
    url = self.get_url_content_queue.get()
    try:
        response = requests.get(url, timeout=1)
        if response.status_code != 200:
            # Bad response: re-queue for retry.
            self.get_url_content_queue.put(url)
        else:
            html = etree.HTML(response.text)
            title = html.xpath('//h1[@class="article-title"]/text()')
            contents = html.xpath('//div[@class="content"]/text()')
            with open('qiushi.txt', 'a', encoding='utf8') as p:
                for x in title:
                    p.write("TITLE:" + x)
                    p.write('\n')
                for i in contents:
                    p.write(i + '\n')
                p.write('\n')
            response.close()
            self.count += 1
            print("下载完成数:{}".format(self.count))
        self.get_url_content_queue.task_done()
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; failed URL is re-queued for retry.
        print("url truble:{}".format(url))
        self.get_url_content_queue.put(url)

def run_sue_more_task(self, func, count=1):
    """Start *count* daemon threads each running *func*."""
    for _ in range(count):
        t = Thread(target=func)
        t.daemon = True  # setDaemon() is deprecated since Python 3.10
        t.start()

def run(self):
    """Fill the page queue, start the worker pools, block until queues drain."""
    self.add_url_to_queue()
    self.run_sue_more_task(self.get_page_url_to_list, 3)
    self.run_sue_more_task(self.get_url_to_content_queue, 3)
    self.run_sue_more_task(self.get_content, 5)
    self.url_queue.join()
    self.get_url_content_queue.join()
    self.url_queue_all.join()
创建实例,调用方法
if __name__ == '__main__':
    # Crawl listing pages 1-11 (max_page=12; range end is exclusive),
    # then block in run() until every queue has been fully processed.
    qbs = get_qiushibaike(12)
    qbs.run()
ps:爬虫有风险,封ip需谨慎,线程一时爽,封号火葬场
原文:https://www.cnblogs.com/argos/p/13210444.html