import requests
import json
import re
import time
from fake_useragent import UserAgent

ua = UserAgent()  # random User-Agent strings for the request headers
class IfengSpider:
    def __init__(self):
        self.urls = [
            'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/6579178829179269120/1568598863000/2000/3-35199-',  # 台湾 (Taiwan) column
            'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/6572044442780307492/1566891029000/2000/3-35190-',  # 暖新闻 (warm news) column
            'https://shankapi.ifeng.com/autumn/xuanzhan/index/getCustomNewsTfList/20/2000',  # 宣战2020 column
            'https://shankapi.ifeng.com/autumn/xijinping/index/getCustomNewsTfList/0/2000'  # 新气象 column
        ]
        self.warm = {}  # articles from the 暖新闻 (warm news) column
        self.tai = {}   # articles from the 台湾 (Taiwan) column
        self.xuan = {}  # articles from the 宣战2020 column
        self.xin = {}   # articles from the 新气象 column
        self.data = {}  # all downloaded articles, across every column
    def newSpider_tai(self, num):  # fetch article data from the 台湾 (Taiwan) column
        url = 'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/6579178829179269120/1568598863000/%s/3-35199-' % num
        res = requests.get(url, headers={'User-Agent': ua.random})
        ss = res.json()['data']['newsstream']
        for i in range(len(ss)):
            item = {
                'title': ss[i]['title'],
                'url': ss[i]['url'],
                'source': ss[i]['source'],
                'newsTime': ss[i]['newsTime'],
            }
            res = requests.get(ss[i]['url'], headers={'User-Agent': ua.random})
            time.sleep(3)
            # the article body is embedded in the page as "var allData = {...};"
            data = re.findall(r'var allData = (.+?});', res.text)
            item['con'] = json.loads(data[0])['docData']
            self.tai[f'tai{i+1}'] = item
            self.data[f'tai{i+1}'] = item
            print('"%s" downloaded.' % ss[i]['title'])
    def newSpider_warm(self, num):  # fetch article data from the 暖新闻 (warm news) column
        url = 'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/6572044442780307492/1566891029000/%s/3-35190-' % num
        res = requests.get(url, headers={'User-Agent': ua.random})
        ss = res.json()['data']['newsstream']
        for i in range(len(ss)):
            item = {
                'title': ss[i]['title'],
                'url': ss[i]['url'],
                'source': ss[i]['source'],
                'newsTime': ss[i]['newsTime'],
            }
            res = requests.get(ss[i]['url'], headers={'User-Agent': ua.random})
            time.sleep(3)
            # the article body is embedded in the page as "var allData = {...};"
            data = re.findall(r'var allData = (.+?});', res.text)
            item['con'] = json.loads(data[0])['docData']
            self.warm[f'warm{i+1}'] = item
            self.data[f'warm{i+1}'] = item
            print('"%s" downloaded.' % ss[i]['title'])
    def newSpider_xuan(self, num):  # fetch article data from the 宣战2020 column
        url = 'https://shankapi.ifeng.com/autumn/xuanzhan/index/getCustomNewsTfList/20/%s' % num
        res = requests.get(url, headers={'User-Agent': ua.random})
        ss = res.json()['data']['newsstream']
        for i in range(len(ss)):
            item = {
                'title': ss[i]['title'],
                'url': ss[i]['url'],
                'source': ss[i]['source'],
                'newsTime': ss[i]['newsTime'],
                'editorName': ss[i]['editorName'],
            }
            res = requests.get(ss[i]['url'], headers={'User-Agent': ua.random})
            time.sleep(3)
            # the article body is embedded in the page as "var allData = {...};"
            data = re.findall(r'var allData = (.+?});', res.text)
            item['con'] = json.loads(data[0])['docData']
            self.xuan[f'xuan{i+1}'] = item
            self.data[f'xuan{i+1}'] = item
            print('"%s" downloaded.' % ss[i]['title'])
    def newSpider_xin(self, num):  # fetch article data from the 新气象 column
        url = 'https://shankapi.ifeng.com/autumn/xijinping/index/getCustomNewsTfList/0/%s' % num
        res = requests.get(url, headers={'User-Agent': ua.random})
        ss = res.json()['data']['newsstream']
        for i in range(len(ss)):
            item = {
                'title': ss[i]['title'],
                'url': ss[i]['url'],
                'source': ss[i]['source'],
                'newsTime': ss[i]['newsTime'],
                'editorName': ss[i]['editorName'],
            }
            res = requests.get(ss[i]['url'], headers={'User-Agent': ua.random})
            time.sleep(3)
            # the article body is embedded in the page as "var allData = {...};"
            data = re.findall(r'var allData = (.+?});', res.text)
            item['con'] = json.loads(data[0])['docData']
            self.xin[f'xin{i+1}'] = item
            self.data[f'xin{i+1}'] = item
            print('"%s" downloaded.' % ss[i]['title'])
s = IfengSpider()
s.newSpider_warm(2)
s.newSpider_xuan(2)
s.newSpider_tai(2)
s.newSpider_xin(2)
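After the run, s.data holds every downloaded article keyed by column and index (tai1, warm1, xuan1, xin1, ...). A minimal sketch of persisting it to disk; the file name ifeng_news.json is only an example:

with open('ifeng_news.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Chinese titles and bodies readable in the output file
    json.dump(s.data, f, ensure_ascii=False, indent=2)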
Source: https://www.cnblogs.com/superSmall/p/11529410.html