(被正则表达式难倒)
# Scraper draft: download a chapter index page from the quanshu site, follow a
# chapter link found on it, and print the text extracted from that chapter page.

import urllib.request
import urllib.error
import urllib.parse
import re
import os

# Browser-like User-Agent sent with every request.
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}

# Pages are decoded as GBK.
_PAGE_ENCODING = 'gbk'

# Captures (href, link text) for every chapter entry on the index page.
_CHAPTER_LINK_RE = re.compile(r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>')

# Captures whatever sits between a closing </script> tag (followed by one
# space) and the next JavaScript <script> tag on a chapter page.
# NOTE(review): without re.S the "." does not match newlines, so text that
# spans several lines is not captured -- confirm against the live page markup
# before adding the flag.
_CHAPTER_TEXT_RE = re.compile(r'</script> (.*?)<script type="text/javascript">')


def _fetch_page(url):
    """Download *url* with the browser User-Agent and return the body decoded as GBK."""
    request = urllib.request.Request(url, headers=_HEADERS)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode(_PAGE_ENCODING)


def get(url):
    """Print the text fragments extracted from a chapter linked on an index page.

    The index page at *url* is downloaded and scanned for chapter links. The
    last link found is resolved against *url* (so relative links work),
    downloaded, and the fragments matched by the chapter-text pattern are
    printed as a list.

    Args:
        url: Address of the chapter index page.

    Returns:
        None. Results and error messages are printed.

    Raises:
        urllib.error.HTTPError: If the chapter page request fails. An
            HTTPError on the index page is reported on stdout instead.
    """
    try:
        index_html = _fetch_page(url)
    except urllib.error.HTTPError as reason:
        print('出错了!错误的原因是:', reason)
        input('')  # keep the console open until the user presses Enter
        return  # nothing to follow without the index page

    links = _CHAPTER_LINK_RE.findall(index_html)
    if not links:
        # Without this guard there is no chapter URL to request.
        print('出错了!错误的原因是:', '未找到章节链接')
        return

    # Only the last matched link is followed.
    # TODO: iterate over every entry in `links` to download the whole book.
    chapter_url = urllib.parse.urljoin(url, links[-1][0])
    chapter_html = _fetch_page(chapter_url)

    regs_2 = _CHAPTER_TEXT_RE.findall(chapter_html)
    print(regs_2)

    # TODO: save the extracted text to a file (the draft targeted
    # '盗墓笔记.TXT'); join the list into one string first and pass an
    # explicit encoding to open().


if __name__ == '__main__':
    # Prompt only when run as a script so importing the module never blocks.
    url = input('请输入要获取的网页(quanshu网):')
    get(url)
其中 reg_2 正则表达式的写法有误(字符串缺少结尾的引号,会导致语法错误),以后慢慢完善吧...
原文:https://www.cnblogs.com/lyn686/p/12991239.html