新浪头条新闻的 URL:https://weibo.com/?category=1760。
# 2. HTML page parsing
#
# Downloads the Sina headline feed, extracts the headline titles together with
# their source / publish-time labels, prints them and stores them in a text
# file.
import os
import re

import requests


def getHTMLText(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The request is sent with browser-like headers (cookie, user agent and
    ``Accept``).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block forever.

    Returns:
        The response text, or the sentinel string "爬取失败" when the request
        fails or the server answers with an HTTP error status.
    """
    # NOTE(review): hard-coded session cookie copied from a browser; it will
    # expire and probably should not be kept in source.
    kv = {
        "Cookie": (
            "SINAGLOBAL=4844987765259.994.1544506324942; "
            "SUB=_2AkMqmKIaf8NxqwJRmPoVxWnmaIV-ygDEieKcxFPBJRMxHRl-yT9jqmc8tRB6ARiM9rPSLjsy2kCgBq61u7x2M9eTeKTA; "
            "SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFYIzVSU-rQ8YIqH5sJ2vs7; "
            "login_sid_t=6f2f5ed24c4e1f2de505c160ca489c97; "
            "cross_origin_proto=SSL; "
            "_s_tentry=www.baidu.com; "
            "UOR=,,www.baidu.com; "
            "Apache=9862472971727.955.1575730782698; "
            "ULV=1575730782710:6:1:1:9862472971727.955.1575730782698:1569219490864; "
            "YF-Page-G0=b7e3c62ec2c0b957a92ff634c16e7b3f|1575731639|1575731637"
        ),
        "user-agent": "Mozilla/5.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    }
    try:
        r = requests.get(url, headers=kv, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they reach the handler.
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Callers receive a sentinel string instead of an exception.
        return "爬取失败"


def getData(nlist, plist, html):
    """Extract headline titles and source/time labels from ``html``.

    Args:
        nlist: List that receives at most 8 headline titles (mutated in place).
        plist: List that receives every ``subinfo S_txt2`` label found
            (mutated in place). printList and dataSave read even entries as
            the news source and odd entries as the publish time.
        html: Page text to search.

    Returns:
        The tuple ``(nlist, plist)`` - the same list objects that were passed.
    """
    # The ``.{0,3}`` wildcards tolerate up to three arbitrary characters around
    # quotes and slashes (presumably backslash-escaped markup - verify against
    # the live page). Capture groups return exactly the wanted text, whatever
    # the wildcards consumed.
    title_pattern = (
        r"class=.{0,3}S_txt1.{0,3}target=.{0,3}_blank.{0,3}>"
        r"(.{1,40})<.{0,3}a><.{0,3}h3>"
    )
    info_pattern = (
        r"<span.{0,3}class=.{0,3}subinfo S_txt2.{0,3}>(.{0,10})<.{0,3}span>"
    )
    # Slicing keeps the 8-title cap without failing when fewer were found.
    nlist.extend(re.findall(title_pattern, html)[:8])
    plist.extend(re.findall(info_pattern, html))
    return nlist, plist


def printList(nlist, plist, num):
    """Print up to ``num`` headline records to standard output.

    Args:
        nlist: Headline titles.
        plist: Flat list alternating news source and publish time.
        num: Maximum number of records to print.
    """
    sources = plist[0::2]
    times = plist[1::2]
    # zip stops at the shortest list, so incomplete records are skipped
    # instead of raising IndexError.
    records = list(zip(nlist, sources, times))[:max(num, 0)]
    for title, source, published in records:
        print("````````````````````````````````````````````````````````````````````````````")
        print("标题:{}".format(title))
        print("新闻来源:{}".format(source))
        print("发布时间:{}".format(published))


def dataSave(nlist, plist, num, directory="C:\\新浪头条新闻"):
    """Write up to ``num`` headline records to ``新浪头条新闻.txt``.

    Args:
        nlist: Headline titles.
        plist: Flat list alternating news source and publish time.
        num: Maximum number of records to write.
        directory: Folder that receives the file; created when missing. The
            default is the Windows path the script originally used.

    Returns:
        None. A failure to create the folder or write the file is reported on
        standard output instead of being raised.
    """
    sources = plist[0::2]
    times = plist[1::2]
    # Only complete (title, source, time) records are written.
    records = list(zip(nlist, sources, times))[:max(num, 0)]
    try:
        os.makedirs(directory, exist_ok=True)
        path = os.path.join(directory, "新浪头条新闻.txt")
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(path, "w", encoding="utf-8") as f:
            for title, source, published in records:
                f.write("````````````````````````````````````````````````````````````````````````````\n")
                f.write("标题:{}\n".format(title))
                f.write("新闻来源:{}\n".format(source))
                f.write("发布时间:{}\n".format(published))
    except OSError as err:
        print("存储失败: {}".format(err))


nlist = []
plist = []
# Sina headline news feed
url = "https://weibo.com/?category=1760"
# Download the page
html = getHTMLText(url)
# Collect the scraped data into the lists
getData(nlist, plist, html)
# Print the records
printList(nlist, plist, 8)
# Store the records
dataSave(nlist, plist, 8)
# Fetch a page
def getHTMLText(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The request is sent with browser-like headers (cookie, user agent and
    ``Accept``).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block forever.

    Returns:
        The response text, or the sentinel string "爬取失败" when the request
        fails or the server answers with an HTTP error status.
    """
    # NOTE(review): hard-coded session cookie copied from a browser; it will
    # expire and probably should not be kept in source.
    kv = {
        "Cookie": (
            "SINAGLOBAL=4844987765259.994.1544506324942; "
            "SUB=_2AkMqmKIaf8NxqwJRmPoVxWnmaIV-ygDEieKcxFPBJRMxHRl-yT9jqmc8tRB6ARiM9rPSLjsy2kCgBq61u7x2M9eTeKTA; "
            "SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFYIzVSU-rQ8YIqH5sJ2vs7; "
            "login_sid_t=6f2f5ed24c4e1f2de505c160ca489c97; "
            "cross_origin_proto=SSL; "
            "_s_tentry=www.baidu.com; "
            "UOR=,,www.baidu.com; "
            "Apache=9862472971727.955.1575730782698; "
            "ULV=1575730782710:6:1:1:9862472971727.955.1575730782698:1569219490864; "
            "YF-Page-G0=b7e3c62ec2c0b957a92ff634c16e7b3f|1575731639|1575731637"
        ),
        "user-agent": "Mozilla/5.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    }
    try:
        r = requests.get(url, headers=kv, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they reach the handler.
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Callers receive a sentinel string instead of an exception.
        return "爬取失败"
def getData(nlist, plist, html):
    """Extract headline titles and source/time labels from ``html``.

    Args:
        nlist: List that receives at most 8 headline titles (mutated in place).
        plist: List that receives every ``subinfo S_txt2`` label found
            (mutated in place). dataSave reads even entries as the news
            source and odd entries as the publish time.
        html: Page text to search.

    Returns:
        The tuple ``(nlist, plist)`` - the same list objects that were passed.
    """
    # The ``.{0,3}`` wildcards tolerate up to three arbitrary characters around
    # quotes and slashes (presumably backslash-escaped markup - verify against
    # the live page). Capture groups return exactly the wanted text, whatever
    # the wildcards consumed, instead of relying on fixed slice offsets.
    title_pattern = (
        r"class=.{0,3}S_txt1.{0,3}target=.{0,3}_blank.{0,3}>"
        r"(.{1,40})<.{0,3}a><.{0,3}h3>"
    )
    info_pattern = (
        r"<span.{0,3}class=.{0,3}subinfo S_txt2.{0,3}>(.{0,10})<.{0,3}span>"
    )
    # Slicing keeps the 8-title cap without failing when fewer were found,
    # e.g. when the page could not be downloaded.
    nlist.extend(re.findall(title_pattern, html)[:8])
    plist.extend(re.findall(info_pattern, html))
    return nlist, plist
# Data storage
def dataSave(nlist, plist, num, directory="C:\\新浪头条新闻"):
    """Write up to ``num`` headline records to ``新浪头条新闻.txt``.

    Args:
        nlist: Headline titles.
        plist: Flat list alternating news source and publish time.
        num: Maximum number of records to write.
        directory: Folder that receives the file; created when missing. The
            default is the Windows path the script originally used.

    Returns:
        None. A failure to create the folder or write the file is reported on
        standard output instead of being raised.
    """
    sources = plist[0::2]
    times = plist[1::2]
    # zip stops at the shortest list, so only complete (title, source, time)
    # records are written and a short scrape cannot raise IndexError.
    records = list(zip(nlist, sources, times))[:max(num, 0)]
    try:
        os.makedirs(directory, exist_ok=True)
        path = os.path.join(directory, "新浪头条新闻.txt")
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(path, "w", encoding="utf-8") as f:
            for title, source, published in records:
                f.write("````````````````````````````````````````````````````````````````````````````\n")
                f.write("标题:{}\n".format(title))
                f.write("新闻来源:{}\n".format(source))
                f.write("发布时间:{}\n".format(published))
    except OSError as err:
        print("存储失败: {}".format(err))
原文:https://www.cnblogs.com/shishuo/p/12019553.html