
Scraping Sina Weibo News with Python

Posted: 2020-09-15 18:09:10
# coding=UTF-8
import requests
import json
import bs4
import re
import os

# Return 1 if the file does not exist yet, 0 if it already exists
def is_file_path(path):
    if os.path.isfile(path):
        return 0
    else:
        return 1
# Make sure the per-day output directory exists and return its path
def is_folder_path(ymd):
    path = os.path.join(r"D:\news", "%s" % (ymd))
    if not os.path.exists(path):
        os.makedirs(path)
    return path

# Parse a news page and write its title, timestamp and body text to disk
def handle_html(html):
    bsobj = bs4.BeautifulSoup(html, "html.parser")
    title_ = bsobj.find("h1", attrs={"class": "art_tit_h1"})
    title = re.findall(r"\>(.*?)\<", str(title_))[0]
    release_time_ = bsobj.find("time", attrs={"class": "art_time"})
    release_time = re.findall(r"\>(.*?)\<", str(release_time_))[0]
    content_list = bsobj.find("article", attrs={"class": "art_box"}).find_all("p")
    ymd = release_time.split(" ")[0]                    # date part, used as the folder name
    dhs = release_time.split(" ")[1].replace(":", "-")  # time part; ":" is not allowed in Windows file names
    path = is_folder_path(ymd)
    # Build the output file path
    file_path = os.path.join(path, "%s.txt" % (dhs))
    limit_number = is_file_path(file_path)
    if limit_number == 1:
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(str(title) + "\n")
            file.write(str(release_time) + "\n")
        for item in content_list:
            matches = re.findall(r"\>(.*?)\<", str(item))
            if not matches:  # skip empty <p></p> tags
                continue
            # Append each paragraph to the file
            with open(file_path, "a", encoding="utf-8") as file:
                file.write(matches[0] + "\n")
    else:
        print("%s.txt" % (dhs), "file already exists")
    print()
# Fetch a single news page by URL and hand the HTML to the parser
def news_url_info(url):
    """
    Purpose: fetch one news page
    Argument: the URL of the target page
    Returns: nothing; the page HTML is passed to handle_html
    """
    url = url.replace("\\", "")  # strip the escaped backslashes left over from the JSON payload
    r = requests.get(url)
    r.raise_for_status()
    handle_html(r.text)
    # Optional: parse in a worker thread instead
    # t = threading.Thread(target=handle_html, args=(r.text,))
    # t.setDaemon(True)  # mark as a daemon thread; must be set before start()
    # t.start()
    # t.join()  # make the main thread wait for the worker

# First extraction pass: strip the JSONP wrapper from the raw response
def handle_one_information(data):
    """
    Purpose: pull the JSON string out of the jsonp callback wrapper
    Argument: the raw response text
    Returns: the bare JSON string
    """
    data = data.split("(")[1].split(")")[0]
    return data
# Second extraction pass: walk the list of news items and visit each URL
def handle_two_information(information):
    """
    Purpose: extract the list of news URLs
    Argument: the parsed JSON from the first pass
    Returns: nothing; each news URL is fetched in turn
    """
    news_list = information["result"].get("data")
    for item in news_list["list"]:
        url = item["URL"]
        news_url_info(url)

def start_url(page, count):
    """
    Purpose: fetch one page of the news-list API
    Arguments: the page number and the jsonp callback counter
    Returns: nothing
    """
    url = ("http://interface.sina.cn/dfz/outside/wap/news/list.d.html"
           "?col=56325&level=undefined&show_num=15&page={}"
           "&act=more&jsoncallback=callbackFunction&callback=jsonp{}").format(page, count)
    header = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Cookie": 'ustat=__10.79.112.91_1600136573_0.19598300; genTime=1600136573; vt=4; Apache=7531690419683.026.1600136575343; SINAGLOBAL=7531690419683.026.1600136575343; ULV=1600136575344:1:1:1:7531690419683.026.1600136575343:; historyRecord={"href":"http://sz.sina.cn/news/list-p1.d.html","refer":"http://sz.sina.com.cn/"}',
        "Host": "interface.sina.cn",
        "Referer": "http://sz.sina.cn/news/list-p1.d.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
    }
    res = requests.get(url, headers=header)
    # The API returns \uXXXX escape sequences; decode them into readable text
    data = res.text.encode("utf-8").decode("unicode_escape")
    information = json.loads(handle_one_information(data))
    handle_two_information(information)

if __name__ == "__main__":
    # Loop bound: depends on how much data you need
    for i in range(1, 100000000):
        start_url(i, i)
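
A note on the JSONP handling: the list endpoint replies with a callback wrapper (jsonpN(...)) rather than bare JSON, which is why handle_one_information slices out the text between the first "(" and the first ")". Here is a minimal sketch of that step with a made-up payload (the callback name and the sample data are invented for illustration):

import json

sample = 'try{jsonp1({"result": {"data": {"list": [{"URL": "http://example.com/a.html"}]}}});}catch(e){};'
inner = sample.split("(")[1].split(")")[0]  # keep only the JSON between the parentheses
print(json.loads(inner)["result"]["data"]["list"][0]["URL"])  # -> http://example.com/a.html

This split is fragile if the payload itself ever contains a parenthesis; matching on the callback name with a regex would be stricter.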

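The encode("utf-8").decode("unicode_escape") round-trip in start_url exists because the endpoint serves Chinese text as \uXXXX escape sequences. A quick illustration with a hard-coded escape sequence:

raw = '{"title": "\\u65b0\\u95fb"}'  # two literal escape sequences that spell 新闻 ("news")
print(raw.encode("utf-8").decode("unicode_escape"))  # -> {"title": "新闻"}

One caveat: unicode_escape treats the bytes as Latin-1, so it would mangle any raw multi-byte UTF-8 characters that are not escaped; it works here because this API escapes everything non-ASCII.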
 
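handle_html recovers tag text by running the regex \>(.*?)\< over str(tag), which only captures up to the first nested tag (for <p>a <b>bold</b> word</p> it returns just "a "). BeautifulSoup's get_text() handles nesting directly. A sketch of the same extraction with get_text(), keeping the original selectors (handle_html_v2 is just an illustrative name):

import bs4

def handle_html_v2(html):
    bsobj = bs4.BeautifulSoup(html, "html.parser")
    title = bsobj.find("h1", attrs={"class": "art_tit_h1"}).get_text(strip=True)
    release_time = bsobj.find("time", attrs={"class": "art_time"}).get_text(strip=True)
    paragraphs = [p.get_text(strip=True)
                  for p in bsobj.find("article", attrs={"class": "art_box"}).find_all("p")]
    return title, release_time, [p for p in paragraphs if p]  # drop empty <p></p> tags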

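Finally, the main loop fires requests back-to-back against a live endpoint. A short pause between pages is kinder to the server and makes the client less likely to be blocked; a minimal variant of the entry point (the 2-second figure is arbitrary):

import time

if __name__ == "__main__":
    for i in range(1, 100000000):  # loop bound: depends on how much data you need
        start_url(i, i)
        time.sleep(2)  # arbitrary pause between pages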

Original: https://www.cnblogs.com/I-love-Xiang/p/13673974.html
