从大众点评网上爬取泉州美食 HTML 页面数据，包括“店名”“点评数”“花费”“菜系”“地点”“口味”“环境”“服务”等字段，并进行分析。
import requests
from bs4 import BeautifulSoup

url = 'http://www.dianping.com/quanzhou/ch10'


def getHTMLText(url, timeout=30):
    """Fetch `url` and return its decoded HTML text.

    Returns the error string '产生异常' on any request failure,
    mirroring the original script's behavior.
    """
    try:
        # Fixed: pass the caller's timeout through — the original
        # hard-coded 30 and silently ignored the parameter.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # guess real encoding from the body
        return r.text
    except requests.RequestException:  # narrowed from a bare except
        return '产生异常'


html = getHTMLText(url)
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())
# Scrape Dianping Quanzhou food listings page by page and save them to CSV.
import requests
from bs4 import BeautifulSoup
import csv
import os


def gethtml(num):
    """Fetch listing page index `num` and return its HTML text ('' on failure)."""
    try:
        number = num + 1
        print('{:<2d}{:<}{:<}'.format(number, '页', ':'))  # show which page is being fetched
        url = 'https://www.dianping.com/quanzhou/ch0' + str(num)
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = 'utf-8'  # force utf-8 decoding
        return r.text
    except Exception as e:
        print(e)
        return ''


def findhtml(text, ul):
    """Parse one listing page and append one row per shop to `ul`.

    Row layout: [店名, 点评数, 人均花费, 菜系, 地点, 口味, 环境, 服务].
    """
    soup = BeautifulSoup(text, 'lxml')
    for link in soup.find_all('li', class_=''):
        if link.h4 is None:
            continue  # not a shop entry
        ui = [link.h4.string]  # shop name
        print('{:^50s}'.format(link.h4.string))

        a1 = link.find('a', class_='review-num')  # review count
        ui.append(a1.b.string if a1 and a1.b else ' ')

        a2 = link.find('a', class_='mean-price')  # average spend
        try:
            ui.append(a2.b.string if a2 else ' ')
        except AttributeError:  # <a> present but <b> missing
            ui.append('')

        a3 = link.find('a', {'data-midas-extends': 'module=5_ad_kwcat'})  # cuisine
        ui.append(a3.string if a3 else ' ')

        # Location: district link plus street address, whichever parts exist.
        a4 = link.find('a', {'data-midas-extends': 'module=5_ad_kwregion'})
        span1 = link.find('span', {'class': 'addr'})
        if a4 and span1:
            ui.append(a4.string + ' ' + span1.string)
        elif span1:
            ui.append(span1.string)
        elif a4:
            ui.append(a4.string)
        else:
            ui.append(' ')

        # Taste / environment / service scores from the comment-list span.
        try:
            spanss = link.find('span', class_='comment-list').contents
            ui.append(spanss[1].b.string)
            ui.append(spanss[3].b.string)
            ui.append(spanss[5].b.string)
        except (AttributeError, IndexError):
            ui.extend(['', '', ''])
        ul.append(ui)


def savehtml(uls):
    """Write the scraped rows to a CSV file under D:/数据."""
    path = 'D://数据'
    if not os.path.exists(path):
        os.makedirs(path)
    # newline='' prevents blank rows on Windows; utf-8-sig keeps Excel happy.
    # NOTE(review): the filename says 南京 although the scrape targets 泉州 —
    # kept as-is to preserve the original output path; confirm intent.
    with open(os.path.join(path, '大众点评南京美食.csv'), 'w',
              newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        # Fixed: the original header listed 9 names for 8 columns and in a
        # different order than findhtml produces; this header matches the rows.
        writer.writerow(['店名', '点评数', '人均花费', '菜系', '地点', '口味', '环境', '服务'])
        for row in uls:
            if row:
                writer.writerow(row)


def main(i):
    """Scrape the first `i` pages and save everything to CSV."""
    ulist = []
    for number in range(int(i)):
        html = gethtml(number)
        findhtml(html, ulist)
    savehtml(ulist)


yeshu = input('输入要查询的总页数(1~50):')
main(yeshu)
# Load the previously scraped dataset and drop columns not needed for analysis.
import pandas as pd
from bs4 import BeautifulSoup  # fixed: used below but never imported in this fragment

data = pd.DataFrame(pd.read_excel('D://数据.xls'))


def getData(titleList, nameList, numList, html):
    """Collect titles, nicknames and counts from `html` into the three lists."""
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a", {"class": "title"}):
        titleList.append(a.string)
    for i in soup.find_all("i", {"class": "acnick"}):
        nameList.append(i.string)
    for i in soup.find_all("i", {"class": "js-num"}):
        numList.append(i.string)  # fixed: original appended to undefined name `List`


# Fixed: the original called methods on the undefined name `数据`; the
# DataFrame is bound to `data` above.
data.drop('点评数', axis=1, inplace=True)  # drop review-count column
data.head()  # preview first five rows
data.drop('点赞数', axis=1, inplace=True)  # drop like-count column
data.head()
data.describe()
# Pie chart of the cuisine distribution.
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # set font BEFORE drawing so Chinese labels render
Labels = ['日本菜', '西餐', '海鲜', '火锅']
# Fixed: the original supplied 6 values for 4 labels, which raises ValueError.
# Kept the first value per label — TODO confirm counts against the dataset.
Data = [356, 92, 45, 14]
plt.pie(Data, labels=Labels, autopct='%1.1f%%')  # fixed: `Type` was undefined
plt.axis('equal')  # equal aspect ratio makes the pie a true circle
plt.title('菜系分布')
plt.show()
# Download each linked spreadsheet from the listing page to D:/数据表.
import requests
from bs4 import BeautifulSoup
import os


def getHTMLText(url):
    """Fetch `url` and return decoded HTML, or '产生异常' on failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:  # narrowed from a bare except
        return "产生异常"


def getFile(html, filelist):
    """Collect file names from the <a> tags nested in <p> tags.

    NOTE(review): the scraped original had an empty inner loop body and
    returned an undefined `urllist`; reconstructed here as "append each
    link's text" — verify against the original article.
    """
    soup = BeautifulSoup(html, "html.parser")
    for p in soup.find_all("p"):
        for a in p.find_all("a"):
            if a.string:
                filelist.append(a.string)


def getURL(html, urllist):
    """Collect the href of every link in `html`.

    NOTE(review): this function was called but missing from the scraped
    source; minimal reconstruction — confirm against the original article.
    """
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a"):
        href = a.get("href")
        if href:
            urllist.append(href)


def dataStore(file, name):
    """Save the response body of `file` to D:/数据表/<name>.xls."""
    try:
        os.mkdir("D:\\数据表")
    except OSError:
        pass  # directory already exists
    try:
        with open("D:\\数据表\\{}.xls".format(name), "wb") as fp:
            fp.write(file.content)
        print("下载成功")
    except OSError:
        # Fixed: the original 'failure' branch was a bare string expression
        # (a no-op); report the failure instead.
        print("存储失败")


# Fixed: the original called these functions before defining them, which
# raises NameError when run as a straight script.
url = "http://www.dianping.com/quanzhou/ch10"
html = getHTMLText(url)
filelist = []          # file names found on the page
getFile(html, filelist)
urllist = []           # download URLs found on the page
getURL(html, urllist)
for i in range(len(urllist)):
    # Fetch each target and store it locally.
    file = requests.get(urllist[i])
    dataStore(file, filelist[i])
原文:https://www.cnblogs.com/tuying/p/12098791.html