# coding=UTF-8
import requests
import json
import bs4
import re
import os


# Check whether a file already exists
def is_file_path(path):
    return os.path.isfile(path)


# Make sure the per-date output folder exists, creating it if needed
def is_folder_path(ymd):
    path = os.path.join(r'D:\news', ymd)
    if not os.path.exists(path):
        os.makedirs(path)
    return path


# Parse an article page and write its title, release time and body to a text file
def handle_html(html):
    bsobj = bs4.BeautifulSoup(html, 'html.parser')
    title = bsobj.find('h1', attrs={'class': 'art_tit_h1'}).get_text(strip=True)
    release_time = bsobj.find('time', attrs={'class': 'art_time'}).get_text(strip=True)
    content_list = bsobj.find('article', attrs={'class': 'art_box'}).find_all('p')
    ymd = release_time.split(' ')[0]                    # date part, used as the folder name
    dhs = release_time.split(' ')[1].replace(':', '-')  # time part; ':' is illegal in Windows filenames
    path = is_folder_path(ymd)
    # Build the target file path
    file_path = os.path.join(path, '%s.txt' % dhs)
    if not is_file_path(file_path):
        with open(file_path, 'a', encoding='utf-8') as file:
            file.write(title + '\n')
            file.write(release_time + '\n')
            for item in content_list:
                # Write each paragraph of the article body
                file.write(item.get_text(strip=True) + '\n')
    else:
        print('%s.txt' % dhs, 'file already exists')
        print()


# Fetch a single news page by url and extract its information
def news_url_info(url):
    """
    Purpose: fetch a single news article page
    Param:   url of the target page
    Returns: nothing; the page html is handed to handle_html
    """
    url = url.replace('\\', '')  # the list API returns urls with escaped slashes
    r = requests.get(url)
    r.raise_for_status()
    handle_html(r.text)


# First pass: strip the jsonp wrapper to obtain the raw JSON string
def handle_one_information(data):
    """
    Purpose: extract the JSON payload from the jsonp response
    Param:   raw response text
    Returns: the JSON string between the outermost parentheses
    """
    # Slice between the first '(' and the last ')' so parentheses
    # inside the payload (e.g. in an article title) don't truncate it
    return data[data.index('(') + 1:data.rindex(')')]


# Second pass: extract the list of news links from the parsed data
def handle_two_information(information):
    """
    Purpose: extract the list of news urls
    Param:   the parsed JSON data from the first pass
    Returns: nothing; each url is handed to news_url_info
    """
    news_list = information['result'].get('data')
    for item in news_list['list']:
        url = item['URL']
        news_url_info(url)


def start_url(page, count):
    """
    Purpose: fetch one page of the news-list API
    Params:  page number and jsonp callback counter
    Returns: nothing
    """
    url = ('http://interface.sina.cn/dfz/outside/wap/news/list.d.html'
           '?col=56325&level=undefined&show_num=15&page={}&act=more'
           '&jsoncallback=callbackFunction&callback=jsonp{}').format(page, count)
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'ustat=__10.79.112.91_1600136573_0.19598300; genTime=1600136573; vt=4; Apache=7531690419683.026.1600136575343; SINAGLOBAL=7531690419683.026.1600136575343; ULV=1600136575344:1:1:1:7531690419683.026.1600136575343:; historyRecord={"href":"http://sz.sina.cn/news/list-p1.d.html","refer":"http://sz.sina.com.cn/"}',
        'Host': 'interface.sina.cn',
        'Referer': 'http://sz.sina.cn/news/list-p1.d.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    res = requests.get(url, headers=header)
    # The payload escapes non-ASCII characters as \uXXXX sequences; decode them
    data = res.text.encode('utf-8').decode('unicode_escape')
    information = json.loads(handle_one_information(data))
    handle_two_information(information)


if __name__ == '__main__':
    """
    Loop bound: depends on how much data you need
    """
    for i in range(1, 100000000):
        start_url(i, i)
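As written, the entry point loops essentially forever (range(1, 100000000)) and fires requests back to back; one bad page or a transient network error kills the whole run. A gentler way to drive the same crawler is to bound the page count and pause between requests. The sketch below reuses start_url unchanged; the 10-page bound and one-second delay are arbitrary assumptions, not values from the original post:

import time

for page in range(1, 11):  # first 10 pages; raise the bound to match the data you need
    try:
        start_url(page, page)
    except requests.RequestException as err:
        # Skip pages that fail to download instead of crashing the whole run
        print('page %d failed: %s' % (page, err))
    time.sleep(1)  # pause between list-page requests so the interface isn't hammered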
Source: https://www.cnblogs.com/I-love-Xiang/p/13673974.html