
Using asyncio in Web Crawlers

Posted: 2020-02-06 19:32:28
# -*- coding: utf-8 -*-
# coroutine_basics.py

import asyncio
import time

async def request(url):
    print("Requesting:", url)
    # r = await asyncio.sleep(3)
    time.sleep(3)
    print("Download finished:", url)

c = request("www.baidu.com")  # calling the async function returns a coroutine object

# 1. Create the event loop
loop = asyncio.get_event_loop()
# 2. Wrap the coroutine object in a task object
task = loop.create_task(c)
# 3. Hand the task to the event loop and run it to completion
loop.run_until_complete(task)
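
For reference, on Python 3.7+ the same basic flow can be driven with asyncio.run(), which creates and closes the event loop for you. A minimal sketch (not part of the original post), using the non-blocking asyncio.sleep instead of time.sleep:

import asyncio

async def request(url):
    print("Requesting:", url)
    await asyncio.sleep(3)  # non-blocking sleep; time.sleep(3) would block the loop
    print("Download finished:", url)

asyncio.run(request("www.baidu.com"))  # builds the loop, runs the coroutine, closes the loop
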
# -*- coding: utf-8 -*-
# bind_callback_to_task.py

import asyncio
import time

async def request(url):
    print("Requesting:", url)
    # r = await asyncio.sleep(3)
    time.sleep(3)
    print("Download finished:", url)
    return 123

c = request("www.baidu.com")  # calling the async function returns a coroutine object

# The callback receives the task object as its argument; in a crawler the callback does the parsing
def parse(task):
    print("This is the callback function")
    print("The printed result is the coroutine's return value:", task.result())


# 1. Create the event loop
loop = asyncio.get_event_loop()
# 2. Wrap the coroutine object in a task object
task = loop.create_task(c)
# Bind a callback function to the task object
task.add_done_callback(parse)

# 3. Hand the task to the event loop and run it to completion
loop.run_until_complete(task)
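
The callback is not the only way to read the return value; awaiting the task yields the same value directly. A small sketch assuming Python 3.7+, not from the original post:

import asyncio

async def request(url):
    await asyncio.sleep(3)
    return 123

async def main():
    task = asyncio.create_task(request("www.baidu.com"))
    task.add_done_callback(lambda t: print("Callback sees:", t.result()))
    result = await task  # the same value the callback receives
    print("Awaited result:", result)

asyncio.run(main())
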
# -*- coding: utf-8 -*-
# multi_task_async_coroutines.py

import asyncio
import time

urls = ["www.baidu.com", "www.sogou.com", "www.sina.com"]
start = time.time()

async def request(url):
    print("Requesting:", url)
    # time.sleep(3)  # blocking code has to be replaced with async-aware code
    await asyncio.sleep(3)  # returns an awaitable (coroutine object)
    print("Download finished:", url)


loop = asyncio.get_event_loop()
# Task list holding multiple task objects
tasks = []
for url in urls:
    c = request(url)  # coroutine object
    task = loop.create_task(c)
    tasks.append(task)

loop.run_until_complete(asyncio.wait(tasks))
print("Total time elapsed:", time.time() - start)
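
asyncio.gather is a common alternative to building the task list by hand; it schedules all coroutines and returns their results in order. A minimal sketch assuming Python 3.7+:

import asyncio
import time

async def request(url):
    print("Requesting:", url)
    await asyncio.sleep(3)
    print("Download finished:", url)
    return url

async def main():
    urls = ["www.baidu.com", "www.sogou.com", "www.sina.com"]
    return await asyncio.gather(*(request(u) for u in urls))

start = time.time()
print(asyncio.run(main()))
print("Total time elapsed:", time.time() - start)  # roughly 3 seconds, not 9
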
# -*- coding: utf-8 -*-
# multi_task_async_coroutines_in_a_crawler.py


import asyncio
import time
import requests
import aiohttp  # unlike requests, aiohttp supports asynchronous requests

# Single thread + multi-task async coroutines
# start = time.time()
# urls = [
#     'http://127.0.0.1:5000/bobo',
#     'http://127.0.0.1:5000/jay',
#     'http://127.0.0.1:5000/tom',
# ]
#
# async def get_pageText(url):
#     print("Downloading", url)
#     page_text = requests.get(url).text     # requests is blocking, so the coroutines cannot actually run concurrently
#     print("Download finished", url)
#     # Returned to the callback function
#     return page_text
#
#
# loop = asyncio.get_event_loop()
# tasks = []
# for url in urls:
#     c = get_pageText(url)
#     task = loop.create_task(c)
#     tasks.append(task)
# loop.run_until_complete(asyncio.wait(tasks))
#
# print('Total time elapsed:', time.time() - start)


start = time.time()
urls = [
    "http://127.0.0.1:5000/bobo",  # page responds after 2 seconds
    "http://127.0.0.1:5000/jay",  # page responds after 2 seconds
    "http://127.0.0.1:5000/tom",  # page responds after 2 seconds
]

# When going through a proxy:
# async with await s.get(url=url, headers=headers, proxy="http://ip:port") as response:
async def get_pageText(url):
    # Open a client session s
    async with aiohttp.ClientSession() as s:
        # await goes wherever network I/O happens: both sending the request and reading the response
        # Send the request; other parameters (e.g. headers) are passed in the parentheses, just like with requests
        async with await s.get(url=url) as response:
            # Read the response body
            page_text = await response.text()
            # print(page_text)
            # page_text is handed to the callback function for parsing
            return page_text


from lxml import etree
def parse(task):
    # Get the return value of the coroutine the task wrapped
    page_text = task.result()

    # # Instantiate an etree parsing object
    # tree = etree.HTML(page_text)
    # page_data = tree.xpath('//*[@id="page"]/a[1]/span[1]/i/@class')[0]

    print(page_text, "start parsing the page")


loop = asyncio.get_event_loop()
tasks = []
for url in urls:
    c = get_pageText(url)
    task = loop.create_task(c)
    # Bind a callback function to each task object
    task.add_done_callback(parse)
    tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))

print("Total time elapsed:", time.time() - start)
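
The timings above assume a local test server where each route takes about 2 seconds to respond. A minimal Flask sketch of such a server (hypothetical, not part of the original post):

# test_server.py -- hypothetical local server assumed by the URLs above
from flask import Flask
import time

app = Flask(__name__)

@app.route("/bobo")
def bobo():
    time.sleep(2)  # simulate a slow page
    return "bobo page"

@app.route("/jay")
def jay():
    time.sleep(2)
    return "jay page"

@app.route("/tom")
def tom():
    time.sleep(2)
    return "tom page"

if __name__ == "__main__":
    app.run(port=5000, threaded=True)  # threaded so the three requests can overlap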

 


Original post: https://www.cnblogs.com/kenD/p/12269620.html
