首页 > Web开发 > 详细

爬取凤凰网站资讯类的新闻

时间:2019-09-16 20:42:37      阅读:236      评论:0      收藏:0      [点我收藏+]
import requests,json
import time
from fake_useragent import UserAgent
import re


# Random User-Agent generator. NOTE(review): created but never used anywhere
# below — presumably intended to be passed as a request header; confirm and
# either wire it into requests.get(..., headers=...) or drop the dependency.
ua = UserAgent()
class IfengSpider:
    """Scraper for news articles from ifeng.com category feed APIs.

    Each ``newSpider_*`` method hits one category's listing endpoint, then
    downloads every article it links to and extracts the JSON payload that
    the article page embeds as ``var allData = {...};``.
    """

    def __init__(self):
        # Listing API endpoints for the four categories.
        # NOTE(review): the original list was missing the comma between the
        # last two URLs, so Python silently concatenated them into one broken
        # string — fixed here.
        self.urls = [
            'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/6579178829179269120/1568598863000/2000/3-35199-',  # Taiwan
            'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/6572044442780307492/1566891029000/2000/3-35190-',  # heartwarming news
            'https://shankapi.ifeng.com/autumn/xuanzhan/index/getCustomNewsTfList/20/2000',  # election 2020
            'https://shankapi.ifeng.com/autumn/xijinping/index/getCustomNewsTfList/0/2000',  # "new atmosphere"
        ]
        self.warm = {}  # most recently scraped heartwarming-news item
        self.tai = {}   # most recently scraped Taiwan item
        self.xuan = {}  # most recently scraped election-2020 item
        self.xin = {}   # most recently scraped "new atmosphere" item
        self.data = {}  # every scraped item, keyed 'tai1', 'warm2', ...

    def _scrape_category(self, url, prefix, extra_keys=()):
        """Fetch *url*'s news stream and download each listed article.

        url        -- listing endpoint returning JSON with data.newsstream.
        prefix     -- prefix for the keys written into self.data.
        extra_keys -- extra per-item fields to copy (e.g. 'editorName').
        Returns the list of item dicts (each is also stored in self.data).
        """
        listing = requests.get(url)
        stream = listing.json()['data']["newsstream"]
        items = []
        for index, entry in enumerate(stream, start=1):
            # Build a FRESH dict per article. The original reused one shared
            # dict per category, so every self.data entry aliased the same
            # object and ended up holding only the last article's fields.
            fields = ('title', 'url', 'source', 'newsTime') + tuple(extra_keys)
            item = {key: entry[key] for key in fields}
            page = requests.get(entry['url'])
            time.sleep(3)  # throttle between article downloads
            # The article content is embedded in a script tag as
            # `var allData = {...};` — grab the JSON object non-greedily.
            blob = re.findall('var allData = (.+?});', page.text)
            item['con'] = json.loads(blob[0])["docData"]
            self.data[f'{prefix}{index}'] = item
            items.append(item)
            print("《%s》下载完成..." % entry['title'])
        return items

    def newSpider_tai(self, num):
        """Scrape the Taiwan category; *num* is the page-size path segment."""
        url = 'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/6579178829179269120/1568598863000/%s/3-35199-' % num
        items = self._scrape_category(url, 'tai')
        if items:
            # Preserve the original observable state: self.tai ends up
            # holding the last article scraped.
            self.tai = items[-1]

    def newSpider_warm(self, num):
        """Scrape the heartwarming-news category."""
        url = 'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/6572044442780307492/1566891029000/%s/3-35190-' % num
        items = self._scrape_category(url, 'warm')
        if items:
            self.warm = items[-1]

    def newSpider_xuan(self, num):
        """Scrape the election-2020 category (items include editorName)."""
        url = 'https://shankapi.ifeng.com/autumn/xuanzhan/index/getCustomNewsTfList/20/%s' % num
        items = self._scrape_category(url, 'xuan', extra_keys=('editorName',))
        if items:
            self.xuan = items[-1]

    def newSpider_xin(self, num):
        """Scrape the "new atmosphere" category (items include editorName)."""
        url = 'https://shankapi.ifeng.com/autumn/xijinping/index/getCustomNewsTfList/0/%s' % num
        items = self._scrape_category(url, 'xin', extra_keys=('editorName',))
        if items:
            self.xin = items[-1]

# Guard the crawl behind __main__ so importing this module does not fire
# off network requests as a side effect (the original ran unconditionally).
if __name__ == "__main__":
    spider = IfengSpider()
    spider.newSpider_warm(2)
    spider.newSpider_xuan(2)
    spider.newSpider_tai(2)
    spider.newSpider_xin(2)

爬取凤凰网站资讯类的新闻

原文:https://www.cnblogs.com/superSmall/p/11529410.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!