# _author: Jolly
# date: 2019/8/30
"""Scrape author names and joke texts from qiushibaike.com text listing pages."""

import re

# Browser-style User-Agent; the site rejects obvious script clients.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

# Patterns compiled once instead of on every page; raw strings so the
# regex escapes (\s) are not interpreted by the Python string parser.
_AUTHOR_RE = re.compile(r'<div\sclass="author clearfix".*?<a.*?<h2>(.*?)</h2>.*?</a>', re.S)
_CONTENT_RE = re.compile(r'<div\sclass="content">.*?<span>(.*?)</span>', re.S)
_TAG_RE = re.compile(r'<.*?>')  # strips leftover inline tags such as <br/>


def extract_posts(text):
    """Parse one listing page's HTML.

    Args:
        text: raw HTML of a qiushibaike text-listing page.

    Returns:
        A list of {'author': str, 'content': str} dicts; authors and
        contents are whitespace-stripped and inner HTML tags removed.
        Pairs up authors and contents positionally via zip.
    """
    authors = [a.strip() for a in _AUTHOR_RE.findall(text)]
    contents = [_TAG_RE.sub("", c).strip() for c in _CONTENT_RE.findall(text)]
    return [{'author': author, 'content': content}
            for author, content in zip(authors, contents)]


def parse_url(page_url):
    """Download one listing page and print its parsed posts.

    Args:
        page_url: full URL of the page to fetch.
    """
    # Imported lazily so the parsing helper is usable without requests installed.
    import requests
    # Bug fix: the original built `headers` but never sent it, so the
    # custom User-Agent had no effect.
    response = requests.get(page_url, headers=headers)
    print(extract_posts(response.text))


def main(n):
    """Scrape pages 1..n inclusive, printing a separator between pages.

    Args:
        n: number of listing pages to fetch.
    """
    url = 'https://www.qiushibaike.com/text/page/{}/'
    for i in range(1, n + 1):
        parse_url(url.format(i))
        print("============" * 20, end='\n\n')


if __name__ == '__main__':
    main(2)
# Source: https://www.cnblogs.com/Jolly-hu/p/12227331.html