# Fetch the nationwide city weather lookup page from tianqi.com
def getHTML(url):
    try:
        hs = {'User-Agent': 'Mozilla/5.0 '}
        # Request the page; give up if there is no response within 60 seconds
        r = requests.get(url, headers=hs, timeout=60)
        # Raise an exception if the request failed (status code != 200)
        r.raise_for_status()
        # Guess the encoding to avoid garbled Chinese text
        r.encoding = r.apparent_encoding
        # Return the full HTML of the page
        return r.text
    except:
        return '无法爬取'
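A quick way to sanity-check getHTML before building on it (a sketch, assuming requests is already imported and getHTML is defined as above; the Quanzhou URL is the one used in the test block later in the post):

page = getHTML("http://www.tianqi.com/quanzhou/")
if page == '无法爬取':
    print("request failed")
else:
    # Print the first 200 characters of the returned HTML
    print(page[:200])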
# Create the folder that will hold the scraped data
def dataSave():
    try:
        # Create the folder
        os.mkdir("C:\\天气网数据")
    except:
        # The folder already exists, so do nothing
        pass

# Scrape the name of every city
def getCityName(html):
    # Lists used to collect the names
    OldList = []
    CityNameList = []
    # Build the parse tree
    soup = BeautifulSoup(html, "html.parser")
    # Locate the block that contains the city list
    tab = soup.find("div", class_="citybox")
    # find_all() returns one <span> per province
    a = tab.find_all("span")
    for b in range(len(a)):
        for x in a[b].find_all("a"):
            if x is not None:
                OldList.append(x.get("name"))
        # Store this province's city names as one sub-list
        CityNameList.append(OldList)
        # Reset OldList for the next province
        OldList = []
    # Return the list of city names
    return CityNameList

# Collect the link of every city
def getCityHref(html):
    # Two lists used to collect the links
    oldList = []
    cityHrefList = []
    # Build the parse tree
    soup = BeautifulSoup(html, "html.parser")
    # Locate the block that contains the city list
    tab = soup.find("div", class_="citybox")
    # find_all() returns one <span> per province
    a = tab.find_all("span")
    for d in range(len(a)):
        for x in a[d].find_all("a"):
            if x is not None:
                oldList.append(x.get("href"))
        # Store this province's links as one sub-list
        cityHrefList.append(oldList)
        # Reset oldList for the next province
        oldList = []
    # Return a list covering every province and its cities
    return cityHrefList

# Scrape a city's temperature
def getTemperature(html):
    soup = BeautifulSoup(html, "html.parser")
    a = soup.find("p", class_="now")
    return a.text

# Scrape a city's humidity
def getHumidity(html):
    soup = BeautifulSoup(html, "html.parser")
    b = soup.find("dd", class_="shidu")
    a1 = b.b.text
    return a1

# Scrape a city's air quality
def getAir(html):
    soup = BeautifulSoup(html, "html.parser")
    c = soup.find("dd", class_="kongqi")
    return c.h5.text

# Scrape a city's wind direction (the second <b> inside the "shidu" block)
def getWind(html):
    soup = BeautifulSoup(html, "html.parser")
    d = soup.find("dd", class_="shidu")
    return d.find_all("b")[1].text

# Test that scraping works
if __name__ == "__main__":
    html = getHTML("http://www.tianqi.com/quanzhou/")
    print(getTemperature(html))
    print(getWind(html))
    print(getAir(html))
    print(getHumidity(html))
Output:
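For reference, the two city-list parsers only assume that the "citybox" div groups cities into <span> blocks whose <a> tags carry name and href attributes. A minimal sketch against a hand-written fragment (for illustration only, not the real page markup) shows the shape of the values they return:

# Hand-written fragment mimicking the citybox structure
sample = '''
<div class="citybox">
  <span>
    <a href="/beijing/" name="北京">北京</a>
    <a href="/haidian/" name="海淀">海淀</a>
  </span>
</div>
'''
print(getCityName(sample))   # [['北京', '海淀']]
print(getCityHref(sample))   # [['/beijing/', '/haidian/']]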
# Two empty lists to hold the scraped and cleaned humidity values
list1 = []
list2 = []
# Display Chinese labels correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
# Display the minus sign correctly
plt.rcParams['axes.unicode_minus'] = False
# Scrape the humidity of the first six cities of the first province
for x in cityHref[0][0:6]:
    html = getHTML("http://www.tianqi.com{}".format(x))
    list1.append(getHumidity(html))
# Keep only the numeric part of each humidity string
for j in list1:
    list2.append(j[3:5])
print(list2)
numbers = [int(x) for x in list2]
s = pd.Series(numbers[0:6], ['海淀', '朝阳', '顺义', '怀柔', '通州', '昌平'])
# Draw a bar chart and set its title
s.plot(kind='bar', title='北京省前6个城市的湿度对比')
# Show the chart
plt.show()
Output:
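The slice j[3:5] assumes every humidity string has a fixed layout along the lines of "湿度:70%", so the number always sits at the same character positions. A slightly more defensive alternative (not part of the original post) is to pull the digits out with a regular expression:

import re
# Extract the first run of digits from each humidity string, e.g. "湿度:70%" -> "70"
list2 = [re.search(r'\d+', j).group() for j in list1]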
# Save the scraped data to disk
def saveCityWeather(cityHref):
    # Make sure the data folder exists
    dataSave()
    try:
        # Write the data scraped for every city
        for x in range(len(cityHref)):
            for a in range(len(cityHref[x])):
                with open("C:\\天气网数据\\天气信息.txt", "a") as f:
                    html = getHTML("http://www.tianqi.com{}".format(cityHref[x][a]))
                    f.write("{}天气\n".format(cityHref[x][a]))
                    f.write("温度:{}\n".format(getTemperature(html)))
                    f.write("{}\n".format(getWind(html)))
                    f.write("{}\n".format(getHumidity(html)))
                    f.write("-------\n")
        print("存储成功")
    except:
        print("存储失败")
Output:
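One design note: the loop above reopens 天气信息.txt once per city. A variant that opens the file a single time and writes the same records (a sketch, not from the original post; the name saveCityWeatherOnce and the explicit utf-8 encoding are additions here) would look like this:

def saveCityWeatherOnce(cityHref):
    dataSave()
    # Open the output file once and append every city's record to it
    with open("C:\\天气网数据\\天气信息.txt", "a", encoding="utf-8") as f:
        for province in cityHref:
            for path in province:
                html = getHTML("http://www.tianqi.com{}".format(path))
                f.write("{}天气\n".format(path))
                f.write("温度:{}\n".format(getTemperature(html)))
                f.write("{}\n".format(getWind(html)))
                f.write("{}\n".format(getHumidity(html)))
                f.write("-------\n")
    print("存储成功")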
The complete program:

# Import the required libraries
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import os
import pandas as pd

# Fetch the nationwide city weather lookup page from tianqi.com
def getHTML(url):
    try:
        hs = {'User-Agent': 'Mozilla/5.0 '}
        # Request the page; give up if there is no response within 60 seconds
        r = requests.get(url, headers=hs, timeout=60)
        # Raise an exception if the request failed (status code != 200)
        r.raise_for_status()
        # Guess the encoding to avoid garbled Chinese text
        r.encoding = r.apparent_encoding
        # Return the full HTML of the page
        return r.text
    except:
        return '无法爬取'

# Create the folder that will hold the scraped data
def dataSave():
    try:
        # Create the folder
        os.mkdir("C:\\天气网数据")
    except:
        # The folder already exists, so do nothing
        pass

# Scrape the name of every city
def getCityName(html):
    # Lists used to collect the names
    OldList = []
    CityNameList = []
    # Build the parse tree
    soup = BeautifulSoup(html, "html.parser")
    # Locate the block that contains the city list
    tab = soup.find("div", class_="citybox")
    # find_all() returns one <span> per province
    a = tab.find_all("span")
    for b in range(len(a)):
        for x in a[b].find_all("a"):
            if x is not None:
                OldList.append(x.get("name"))
        # Store this province's city names as one sub-list
        CityNameList.append(OldList)
        # Reset OldList for the next province
        OldList = []
    # Return the list of city names
    return CityNameList

# Collect the link of every city
def getCityHref(html):
    # Two lists used to collect the links
    oldList = []
    cityHrefList = []
    # Build the parse tree
    soup = BeautifulSoup(html, "html.parser")
    # Locate the block that contains the city list
    tab = soup.find("div", class_="citybox")
    # find_all() returns one <span> per province
    a = tab.find_all("span")
    for d in range(len(a)):
        for x in a[d].find_all("a"):
            if x is not None:
                oldList.append(x.get("href"))
        # Store this province's links as one sub-list
        cityHrefList.append(oldList)
        # Reset oldList for the next province
        oldList = []
    # Return a list covering every province and its cities
    return cityHrefList

# Scrape a city's temperature
def getTemperature(html):
    soup = BeautifulSoup(html, "html.parser")
    a = soup.find("p", class_="now")
    return a.text

# Scrape a city's humidity
def getHumidity(html):
    soup = BeautifulSoup(html, "html.parser")
    b = soup.find("dd", class_="shidu")
    a1 = b.b.text
    return a1

# Scrape a city's air quality
def getAir(html):
    soup = BeautifulSoup(html, "html.parser")
    c = soup.find("dd", class_="kongqi")
    return c.h5.text

# Scrape a city's wind direction (the second <b> inside the "shidu" block)
def getWind(html):
    soup = BeautifulSoup(html, "html.parser")
    d = soup.find("dd", class_="shidu")
    return d.find_all("b")[1].text

# Test that scraping works
if __name__ == "__main__":
    html = getHTML("http://www.tianqi.com/quanzhou/")
    print(getTemperature(html))
    print(getWind(html))
    print(getAir(html))
    print(getHumidity(html))

# Save the scraped data to disk
def saveCityWeather(cityHref):
    # Make sure the data folder exists
    dataSave()
    try:
        # Write the data scraped for every city
        for x in range(len(cityHref)):
            for a in range(len(cityHref[x])):
                with open("C:\\天气网数据\\天气信息.txt", "a") as f:
                    html = getHTML("http://www.tianqi.com{}".format(cityHref[x][a]))
                    f.write("{}天气\n".format(cityHref[x][a]))
                    f.write("温度:{}\n".format(getTemperature(html)))
                    f.write("{}\n".format(getWind(html)))
                    f.write("{}\n".format(getHumidity(html)))
                    f.write("-------\n")
        print("存储成功")
    except:
        print("存储失败")

# Put the functions together
if __name__ == "__main__":
    html = getHTML("http://www.tianqi.com/quanzhou/")
    html2 = getHTML("http://www.tianqi.com/chinacity.html")
    cityHref = getCityHref(html2)
    dataSave()
    saveCityWeather(cityHref)
    # Print every city's weather to the console
    for x in range(len(cityHref)):
        for a in range(len(cityHref[x])):
            html = getHTML("http://www.tianqi.com{}".format(cityHref[x][a]))
            print(cityHref[x][a] + "天气")
            print(getWind(html))
            print(getAir(html))
            print(getHumidity(html))
            print("-------\n")
    # Data analysis and visualization
    # Two empty lists to hold the scraped and cleaned humidity values
    list1 = []
    list2 = []
    # Display Chinese labels correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Display the minus sign correctly
    plt.rcParams['axes.unicode_minus'] = False
    # Scrape the humidity of the first six cities of the first province
    for x in cityHref[0][0:6]:
        html = getHTML("http://www.tianqi.com{}".format(x))
        list1.append(getHumidity(html))
    # Keep only the numeric part of each humidity string
    for j in list1:
        list2.append(j[3:5])
    print(list2)
    numbers = [int(x) for x in list2]
    s = pd.Series(numbers[0:6], ['海淀', '朝阳', '顺义', '怀柔', '通州', '昌平'])
    # Draw a bar chart and set its title
    s.plot(kind='bar', title='北京省前6个城市的湿度对比')
    # Show the chart
    plt.show()
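The full program requests every city page back to back. If you run it yourself, a small pause between requests keeps the load on tianqi.com reasonable. One way to add it (a sketch, not in the original post; politeGetHTML is a made-up wrapper name) is to wrap getHTML and call the wrapper inside the city loops:

import time

# Hypothetical wrapper that pauses before each request so pages are not
# fetched back to back; drop-in replacement for getHTML in the loops above.
def politeGetHTML(url, delay=1):
    time.sleep(delay)
    return getHTML(url)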
Original post: https://www.cnblogs.com/lw0900/p/12072377.html