# Data scraping (数据抓取)
import asyncio
import aiohttp
from lxml import etree
import re
from collections import namedtuple
# Parameters for one scrape job: the city slug used in the URL, plus the
# year and month of the history page to fetch.
Args = namedtuple("Args", ["city", "year", "month"])
async def work(args):
    """Download one monthly weather-history page and return its table cell texts.

    Args:
        args: Object exposing ``city``, ``year`` and ``month`` attributes.
            They are interpolated into the tianqihoubao.com URL as
            ``<city>/month/<year><two-digit month>.html``.

    Returns:
        A list of strings, one for each ``<td>`` under ``table[@class='b']``
        that has leading text, with every "CR LF followed by spaces" run
        removed. Cells whose leading text is absent, or consists solely of
        such a run, are skipped.
    """
    url = "http://www.tianqihoubao.com/lishi/%s/month/%d%02d.html" % (
        args.city,
        args.year,
        args.month,
    )
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
        )
    }
    # Compiled once: the same pattern both filters and cleans every cell.
    padding = re.compile("\r\n *")
    # Same 1000-second overall budget as before, in aiohttp's structured form.
    timeout = aiohttp.ClientTimeout(total=1000)

    async with aiohttp.ClientSession() as session:
        # ssl=False is the supported spelling of the deprecated
        # verify_ssl=False: certificate verification stays disabled.
        async with session.get(
            url, headers=headers, timeout=timeout, ssl=False
        ) as response:
            html = await response.text()

    ethtml = etree.HTML(html)
    result = []
    for cell in ethtml.xpath("//table[@class='b']/tr/td"):
        text = cell.text
        # lxml reports ``None`` when a cell starts with a child element
        # instead of text; treat it like a padding-only cell rather than
        # letting the regex call raise TypeError.
        if text is None or padding.fullmatch(text):
            continue
        result.append(padding.sub("", text))
    return result
async def main():
    """Fetch Wuhan's pages for months 1 through 5 of 2018 concurrently and print them.

    Results are printed in month order, one list per line. All fetches are
    awaited before anything is printed; if a fetch failed, its exception is
    re-raised by ``task.result()`` when that task's turn to print comes.
    """
    tasks = [
        asyncio.create_task(work(Args("wuhan", 2018, month)))
        for month in range(1, 6)
    ]
    await asyncio.wait(tasks)
    for task in tasks:
        print(task.result())


if __name__ == "__main__":
    # asyncio.run creates a fresh event loop and closes it afterwards,
    # replacing the deprecated bare asyncio.get_event_loop() pattern.
    asyncio.run(main())
# Original article (原文): https://www.cnblogs.com/plyonfly/p/11391848.html