首页 > 编程语言 > 详细

多线程爬虫

时间:2020-02-28 13:33:01      阅读:62      评论:0      收藏:0      [点我收藏+]
import requests
import threading
import pymongo
from threading import Lock
from queue import Queue, Empty
from lxml import etree


class CrawlThread(threading.Thread):
    """Crawler thread: pulls page numbers off pageQueue, downloads the
    corresponding listing page and pushes the raw HTML onto dataQueue."""

    def __init__(self, name, pageQueue, dataQueue):
        # Initialize the Thread machinery before adding our own state.
        super().__init__()
        self.threadname = name        # human-readable worker name (for logs)
        self.pageQueue = pageQueue    # queue of page numbers still to fetch
        self.dataQueue = dataQueue    # queue of raw HTML awaiting parsing
        # Fixed desktop browser headers so the site serves the normal page.
        self.headers = {
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/80.0.3987.122 Safari/537.36"),
        }

    def run(self):
        base_url = "https://www.xiaohua.com/duanzi?page=%s"
        while True:
            try:
                # block=False: raise queue.Empty immediately once the queue
                # is drained — that is this worker's signal to exit.
                page = self.pageQueue.get(block=False)
            except Empty:
                break
            url = base_url % page
            print("%s正在采集数据" % self.threadname)
            try:
                response = requests.get(url=url, headers=self.headers)
                response.raise_for_status()
            except requests.RequestException as exc:
                # Skip a page that fails to download instead of silently
                # killing the whole worker (the original bare except did).
                print("%s下载失败: %s" % (self.threadname, exc))
                continue
            # Hand the raw HTML to the parser stage.
            self.dataQueue.put(response.text)
            print("%s提交数据完毕" % self.threadname)


class ParseThread(threading.Thread):
    """Parser thread: pulls raw HTML off dataQueue, extracts joke authors
    with lxml and stores each record in MongoDB (writes serialized by lock)."""

    def __init__(self, name, dataQueue, lock):
        super().__init__()
        self.threadname = name      # human-readable worker name (for logs)
        self.dataQueue = dataQueue  # queue of raw HTML produced by crawlers
        self.lock = lock            # serializes MongoDB writes across parsers
        self._table = None          # lazily-created MongoDB collection handle

    def run(self):
        while True:
            try:
                # Crawlers are joined before parsers start, so an empty
                # queue means there is nothing left to parse — exit.
                html = self.dataQueue.get(block=False)
            except Empty:
                break
            print("%s正在解析数据" % self.threadname)
            self.parse(html)
            print("%s解析数据完毕" % self.threadname)

    def parse(self, html):
        """Extract the author of every joke card on one listing page and
        persist each as a {"author": ...} document."""
        tree = etree.HTML(html)
        for div in tree.xpath('//div[@class="one-cont"]'):
            item = {"author": div.xpath("./div/div/a/i/text()")[0]}
            # Lock so only one parser writes to MongoDB at a time.
            with self.lock:
                self.save(item)

    def save(self, item):
        # Reuse a single MongoDB client per thread instead of opening a new
        # connection for every inserted document (the original leaked one
        # client per item).
        if self._table is None:
            conn = pymongo.MongoClient("localhost", 27017)
            self._table = conn.jokes_dxc_author.xhua
        self._table.insert_one(item)


def main():
    """Two-phase pipeline: 3 crawler threads drain the page queue, then
    (after all crawlers finish) 3 parser threads drain the data queue."""
    # Page numbers 1..100 to crawl.
    pageQueue = Queue()
    for page in range(1, 101):
        pageQueue.put(page)
    # Raw HTML buffered between the crawl and parse phases.
    dataQueue = Queue()
    # Serializes database writes among parser threads.
    lock = Lock()

    # Phase 1: start all crawlers and wait for them to finish.
    crawlers = []
    for name in ["爬虫1号", "爬虫2号", "爬虫3号"]:
        worker = CrawlThread(name, pageQueue, dataQueue)
        worker.start()
        crawlers.append(worker)
    for worker in crawlers:
        worker.join()

    # Phase 2: start all parsers and wait for them to finish.
    parsers = []
    for name in ["解析一号", "解析二号", "解析三号"]:
        worker = ParseThread(name, dataQueue, lock)
        worker.start()
        parsers.append(worker)
    for worker in parsers:
        worker.join()


if __name__ == "__main__":
    main()

 

多线程爬虫

原文:https://www.cnblogs.com/weiwuhu/p/12376530.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!