 
"""Small asyncio + aiohttp crawler.

Every URL in ``url_hub`` is put on a queue and fetched by three concurrent
workers; URLs fetched successfully are recorded in ``crawled_urls``.
"""
import asyncio
from urllib.parse import urljoin, urldefrag

import aiohttp
import async_timeout

# Starting URL; also the base that get_urls() resolves links against.
# The pasted source read 'http://python/org/' (host "python", path "/org/"),
# a typo for the python.org host.
root_url = 'http://python.org/'
crawled_urls, url_hub = [], [root_url]
headers = {
    'user-agent': ('Opera/9.80 (X11; Linux x86_64; U; en) '
                   'Presto/2.2.15 Version/10.10'),
}


async def get_body(url):
    """Fetch *url* and describe the outcome as a dict.

    Args:
        url: Address to request with the module-level ``headers``.

    Returns:
        A dict with keys ``'error'``, ``'html'`` and ``'url'``.  On HTTP 200
        ``'error'`` is ``''`` and ``'html'`` holds the decoded body.  On any
        other status ``'error'`` is the integer status code.  If the request
        raises (including the 10 second timeout) ``'error'`` is the ``repr``
        of the exception.  ``'html'`` is ``''`` in both failure cases.
    """
    async with aiohttp.ClientSession() as session:
        try:
            # async-timeout only supports the asynchronous context manager.
            async with async_timeout.timeout(10):
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        html = await response.text()
                        return {'error': '', 'html': html, 'url': url}
                    return {'error': response.status, 'html': '', 'url': url}
        except Exception as err:
            # `response` may never have been bound here, so report the
            # exception itself.  repr() is used because str() of a timeout
            # error is empty, which callers would read as "no error".
            return {'error': repr(err), 'html': '', 'url': url}


async def handle_task(task_id, work_queue):
    """Worker loop: drain *work_queue*, fetching each URL not yet crawled.

    Args:
        task_id: Identifier of this worker (not used by the loop itself).
        work_queue: ``asyncio.Queue`` of URL strings.

    Successful fetches are appended to ``crawled_urls`` and handed to
    ``parse``; failures are reported on stdout.
    """
    while not work_queue.empty():
        queue_url = await work_queue.get()
        if queue_url in crawled_urls:
            continue

        body = await get_body(queue_url)
        if body['error']:
            print('{}爬取失败'.format(queue_url))
            continue

        crawled_urls.append(queue_url)
        parse(body)


# Parse the returned data.
def parse(body):
    """Hook for processing a successful ``get_body`` result; currently a no-op."""
    pass


def remove_fragment(url):
    """Return *url* with any ``#fragment`` part removed."""
    return urldefrag(url)[0]


# Parse the html and build the new URLs.
def get_urls(html):
    """Extract link targets from *html* as absolute URLs.

    This uses plain string splitting rather than an HTML parser: single
    quotes are normalised to double quotes, the text after every ``href="``
    up to the next ``"`` is taken as a link, its fragment is stripped, and
    the result is resolved against ``root_url``.

    Note: nothing in this file calls this function yet, so the crawler does
    not follow links on its own.
    """
    normalised = str(html).replace("'", '"')
    new_urls = [chunk.split('"')[0] for chunk in normalised.split('href="')[1:]]
    return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]


async def main():
    """Seed the queue from ``url_hub`` and run three workers to completion."""
    # Create the queue inside the running loop so it is bound to that loop.
    work_queue = asyncio.Queue()
    for url in url_hub:
        work_queue.put_nowait(url)
    # Three concurrent workers.  gather() accepts coroutines directly,
    # whereas asyncio.wait() no longer does.
    await asyncio.gather(
        *(handle_task(task_id, work_queue) for task_id in range(3))
    )


if __name__ == '__main__':
    asyncio.run(main())
    for u in crawled_urls:
        print(u)
    print('-' * 30)
    print(len(crawled_urls))
# Original article: https://www.cnblogs.com/zhongshuiping/p/10172362.html