
<Crawler> Multithreaded Crawling

Posted: 2019-07-07 16:10:13

1. Thread Review

import time
import threading

'''1. A single main thread'''
# def sing():
#     for i in range(1, 6):
#         print('come baby, get hyped with me!!!')
#         time.sleep(1)
#
# def dance():
#     for i in range(1, 6):
#         print('cha-cha, belly dance, pole dance, hahaha ......')
#         time.sleep(1)

# def main():
#     sing()
#     dance()

# if __name__ == '__main__':
#     main()

'''2. Creating threads procedurally: one main thread, two worker threads'''
# def sing(a):
#     for i in range(1, 6):
#         print('Current thread: %s ... come on %s, get hyped with me!!!' % (threading.current_thread().name, a))
#         time.sleep(1)

# def dance(a):
#     for i in range(1, 6):
#         print('Current thread: %s ... cha-cha, belly dance, pole dance, %s, which will it be' % (threading.current_thread().name, a))
#         time.sleep(1)

# def main():
#     print('...the gala begins now...')
#     # Create the singing thread
#     a = '悟空'
#     t_sing = threading.Thread(target=sing, name='sing', args=(a,))
#
#     # Create the dancing thread
#     t_dance = threading.Thread(target=dance, name='dance', args=(a,))
#
#     # Start the threads
#     t_sing.start()
#     t_dance.start()
#
#     # Make the main thread wait for the workers to finish
#     t_sing.join()
#     t_dance.join()
#
#     print('The show is over; everyone heads home')
# if __name__ == '__main__':
#     main()

'''3. Creating threads with classes'''

# Write a class that inherits from threading.Thread
class SingThread(threading.Thread):
    def __init__(self, name, a):
        super().__init__()
        self.name = name
        self.a = a

    def run(self):
        print('Thread name: %s  argument: %s' % (self.name, self.a))
        for i in range(1, 6):
            print('爱江山更爱美人...')
            time.sleep(1)

class DanceThread(threading.Thread):
    def __init__(self, name, a):
        super().__init__()
        self.name = name
        self.a = a

    def run(self):
        print('Thread name: %s  argument: %s' % (self.name, self.a))
        for i in range(1, 6):
            print('boom-cha-cha, boom-cha-cha...')
            time.sleep(1)

def main():
    # Create the threads
    t_sing = SingThread('sing', '八戒')
    t_dance = DanceThread('dance', '悟能')

    # Start the threads
    t_sing.start()
    t_dance.start()

    # Make the main thread wait for the workers to finish
    t_sing.join()
    t_dance.join()

if __name__ == '__main__':
    main()
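As a point of comparison (not in the original post), the same demo can be written with the standard library's concurrent.futures.ThreadPoolExecutor, which takes care of starting and joining the workers; a minimal sketch:

import time
from concurrent.futures import ThreadPoolExecutor

# Alternative to the hand-rolled Thread subclasses above (my sketch, not from the original post)
def perform(action, who):
    # Simulate five rounds of work, like the run() methods above
    for i in range(1, 6):
        print('%s: %s (%d/5)' % (who, action, i))
        time.sleep(1)

if __name__ == '__main__':
    # Leaving the with-block waits for all submitted tasks to finish
    with ThreadPoolExecutor(max_workers=2) as pool:
        pool.submit(perform, 'sing', '八戒')
        pool.submit(perform, 'dance', '悟能')
    print('All tasks finished')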

2. Queues

from queue import Queue

# Create a queue
q = Queue(5)        # room for 5 items
print(q.empty())    # check whether the queue is empty

# Put data in
q.put('浓眉哥')
q.put('勒布朗')
q.put('丹尼·格林')
q.put('库兹马')
q.put('麦基')
print(q.full())     # check whether the queue is full
print(q.qsize())    # current number of items in the queue
# q.put('波普', False)     # if the queue is full, raise queue.Full immediately
# q.put('波普', True, 3)   # if there is still no room after waiting 3 seconds, raise queue.Full

# Get data out: first in, first out
print(q.get())
print(q.get())
print(q.get())
print(q.get())
print(q.get())
# q.get(False)     # if the queue is empty, raise queue.Empty immediately
# q.get(True, 3)   # if the queue is still empty after waiting 3 seconds, raise queue.Empty
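Queue does its own locking, which is what makes it safe to share between threads; that property is exactly what the crawler in the next section relies on. A minimal producer/consumer sketch (the produce/consume names are mine, not from the original post):

import threading
from queue import Queue

def produce(q):
    for i in range(5):
        q.put(i)          # blocks while the queue is full
    q.put(None)           # sentinel: tell the consumer to stop

def consume(q):
    while True:
        item = q.get()    # blocks until an item is available
        if item is None:  # sentinel received, stop
            break
        print('consumed:', item)

if __name__ == '__main__':
    q = Queue(3)
    t_prod = threading.Thread(target=produce, args=(q,))
    t_cons = threading.Thread(target=consume, args=(q,))
    t_prod.start()
    t_cons.start()
    t_prod.join()
    t_cons.join()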

3. Multithreaded Crawler

import time
import threading
from queue import Queue, Empty
import requests
from lxml import etree
import json

# List that holds the crawl threads
crawl_thread_list = []
# List that holds the parse threads
parse_thread_list = []

def create_queue():
    # Create the page-number queue
    page_queue = Queue()
    for page in range(1, 6):
        page_queue.put(page)

    # Create the content queue
    data_queue = Queue()

    return page_queue, data_queue

class CrawlThread(threading.Thread):
    def __init__(self, name, page_queue, data_queue):
        super(CrawlThread, self).__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/jiantu-{}'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    def run(self):
        print('%s starting...' % self.name)
        # Keep pulling page numbers until the queue is drained
        while 1:
            if self.page_queue.empty():
                break
            # Take a page number out of the queue
            page = self.page_queue.get()
            # Build the URL
            url = self.url.format(page)
            # Send the request and get the response
            r = requests.get(url=url, headers=self.headers)
            # Put the response body into data_queue
            self.data_queue.put(r.text)
        print('%s finished...' % self.name)

class ParseThread(threading.Thread):
    def __init__(self, name, data_queue, fp, lock):
        super(ParseThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def parse_content(self, data):
        tree = etree.HTML(data)
        '''Find all the li elements first, then look up each image's title and src under its li'''
        li_list = tree.xpath('//ul[@class="cont-list"]/li')
        items = []
        for li in li_list:
            # Get the image title (relative to this li)
            img_title = li.xpath('.//h2/a/text()')[0]
            # Get the image URL (relative to this li)
            img_url = li.xpath('.//div[@class="cont-list-main"]/p/img/@data-src')[0]
            item = {'title': img_title,
                    'url': img_url}
            items.append(item)

        # Write to the file
        self.lock.acquire()      # acquire the lock
        for item in items:
            self.fp.write(json.dumps(item, ensure_ascii=False) + '\n')
        self.lock.release()      # release the lock

    def run(self):
        print('%s starting...' % self.name)
        while 1:
            # Take one page of data from data_queue;
            # give up once the queue has stayed empty for 3 seconds
            try:
                data = self.data_queue.get(True, 3)
            except Empty:
                break
            # Parse the content
            self.parse_content(data)
        print('%s finished...' % self.name)


def create_crawl_thread(page_queue, data_queue):
    crawl_name = ['Crawler-1', 'Crawler-2', 'Crawler-3']
    for name in crawl_name:
        # Create a worker thread
        t_crawl = CrawlThread(name, page_queue, data_queue)
        # Keep it in the list
        crawl_thread_list.append(t_crawl)

def create_parse_thread(data_queue, fp, lock):
    parse_name = ['Parser-1', 'Parser-2', 'Parser-3']
    for name in parse_name:
        # Create a worker thread
        t_parse = ParseThread(name, data_queue, fp, lock)
        # Keep it in the list
        parse_thread_list.append(t_parse)

def main():
    # Create the queues
    page_queue, data_queue = create_queue()

    # Open the output file
    fp = open('jiantu.txt', 'a', encoding='utf8')

    # Create the lock
    lock = threading.Lock()

    # Create the crawl threads
    create_crawl_thread(page_queue, data_queue)
    # Create the parse threads
    create_parse_thread(data_queue, fp, lock)

    # Start the crawl threads
    for t in crawl_thread_list:
        t.start()
    # Start the parse threads
    for t in parse_thread_list:
        t.start()

    # Make the main thread wait for the workers to finish
    for t in crawl_thread_list:
        t.join()
    for t in parse_thread_list:
        t.join()

    # Close the file
    fp.close()

    print('Main thread finished!')

if __name__ == '__main__':
    main()
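One caveat about the 3-second timeout in ParseThread.run: if a page takes longer than that to download, a parser can exit while work is still on the way. A more robust shutdown (my own sketch, not from the original post) uses the Queue.task_done()/Queue.join() pattern plus one sentinel per worker:

import threading
from queue import Queue

# Sketch of a sentinel-based shutdown (not the original post's approach)
def worker(data_queue):
    while True:
        data = data_queue.get()       # blocks until an item arrives
        if data is None:              # sentinel: time to exit
            data_queue.task_done()
            break
        print('parsed %d characters' % len(data))
        data_queue.task_done()        # mark this item as fully processed

if __name__ == '__main__':
    data_queue = Queue()
    workers = [threading.Thread(target=worker, args=(data_queue,)) for _ in range(3)]
    for t in workers:
        t.start()
    for page in ['<html>page 1</html>', '<html>page 2</html>']:
        data_queue.put(page)
    data_queue.join()                 # wait until every real item is task_done()
    for _ in workers:
        data_queue.put(None)          # one sentinel per worker
    for t in workers:
        t.join()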

 


Original: https://www.cnblogs.com/Finance-IT-gao/p/11146517.html
