实现:
import requests
import re
import time


def get_chapter(aim_url):
    """Fetch chapter titles and their page URLs from the novel's index page.

    :param aim_url: URL of the novel's table-of-contents page
    :return: dict mapping chapter title -> chapter page URL
    """
    ret = requests.get(aim_url)
    # The site serves GBK-encoded pages; decode explicitly.
    content = ret.content.decode('gbk')
    # Each chapter link lives inside an <li><a ...>...</a></li> element.
    li = re.findall('<li><a.*</li>', content)
    url_dic = {}
    for i in li:
        print(i)
        # The title sits between '">' and '</a'; strip those delimiters.
        chapter = re.search(r'">.*</a', i).group()[2:-3]
        # Chapter pages are named by a 7-digit number. The dot is escaped so
        # ".html" is matched literally (the original pattern's bare dot could
        # match e.g. "1234567xhtml").
        url = aim_url + re.search(r'[0-9]{7}\.html', i).group()
        url_dic[chapter] = url
    return url_dic


def get_content(chapter_url):
    """Download each chapter's body text and append it to novel.txt.

    :param chapter_url: dict mapping chapter title -> chapter page URL
    :return: None (writes to novel.txt as a side effect)
    """
    requests.adapters.DEFAULT_RETRIES = 5
    for chapter, url in chapter_url.items():
        try:
            ret = requests.get(url).content.decode('gbk')
        except requests.exceptions.ConnectionError:
            # The server drops connections when requests come too fast;
            # back off briefly and retry once.
            print('连接太快了..等等')
            time.sleep(5)
            ret = requests.get(url).content.decode('gbk')
        # Collapse newlines so the body regex can match across the whole page.
        ret = ret.replace('\n', '')
        # Body text sits between the content div and the "chapterpage" marker;
        # [35:] drops the opening <div ...> tag itself.
        content = re.search(
            r'<div id="content" class="content">.*chapterpage', ret
        ).group()[35:]
        # Strip the HTML entities/tags that appear in the body text.
        # NOTE(review): the blog rendering mangled these literals; '&nbsp;'
        # is the presumed original of the space-for-space replacement —
        # confirm against the live page markup.
        content = content.replace('&nbsp;', ' ')
        content = content.replace('<br />', ' ')
        content = content.replace('&#39;', '')
        # Append so successive chapters accumulate in one file.
        with open('novel.txt', mode='a', encoding='utf-8') as f:
            f.write(chapter + '\n')
            f.write(content)
            f.write('\n\n')
        print(chapter)


if __name__ == '__main__':
    li = get_chapter('https://www.9dxs.com/2/2348/index.html')
    get_content(li)
遇到的问题:
爬取一半时,抛出了requests.exceptions.ConnectionError异常
分析:
解决:
requests.adapters.DEFAULT_RETRIES = 5
try:
    ret = requests.get(url).content.decode('gbk')
except requests.exceptions.ConnectionError:
    print('连接太快了..等等')
    time.sleep(5)
    ret = requests.get(url).content.decode('gbk')
原文:https://www.cnblogs.com/walthwang/p/10452643.html