首页 > 编程语言 > 详细

python3-对拉勾网数据爬取及简单的数据分析

时间:2020-01-07 16:10:27      阅读:99      评论:0      收藏:0      [点我收藏+]
#encoding:utf-8
import json
import os
import re
import time
from bisect import bisect_left

import matplotlib.pyplot as plt
import requests

class LaGouAnsialy():
    """Scrape job postings from lagou.com and chart their salary distribution.

    Workflow (see ``main``): page through the position-search endpoint,
    store education / salary / experience of every posting in
    ``OUTPUT_FILE`` as JSON, then draw a pie chart of salary brackets.
    """

    # File the scraped postings are written to and the report reads from.
    OUTPUT_FILE = "lagou.json"
    # Inclusive upper edges (in "k") of the salary brackets; anything above
    # the last edge falls into a final open-ended bracket.
    SALARY_BOUNDS = (8, 12, 15, 18, 22, 30)
    # Matches every "<number>k" amount in a salary string, upper or lower case.
    _SALARY_RE = re.compile(r"(\d+)\s*k", re.IGNORECASE)

    def __init__(self, search="python", city="深圳"):
        """Set up request headers, the endpoint and the search criteria.

        Args:
            search: keyword sent as the ``kd`` form field.
            city: city name sent as the ``city`` query parameter.
        """
        # NOTE(review): Cookie and the token headers look like values copied
        # from one browser session; presumably they expire -- confirm and
        # refresh before relying on them.
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Content-Length": "25",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Cookie": "JSESSIONID=ABAAABAABEEAAJA3893CB27253239CD99CA00B5B714A93D; WEBTJ-ID=20200102153439-16f652d85a5430-089bd0ca7e1d01-3a65420e-2073600-16f652d85a69d0; _ga=GA1.2.1705075091.1577950480; _gid=GA1.2.613899177.1577950480; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1577950480; user_trace_token=20200102153440-55dd897c-2d32-11ea-b0f6-525400f775ce; LGUID=20200102153440-55dd8ebd-2d32-11ea-b0f6-525400f775ce; TG-TRACK-CODE=index_search; X_MIDDLE_TOKEN=7064071e9d874446822efc2a3b85cc31; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216f65612f75770-0a06d44f49f747-3a65420e-2073600-16f65612f769d9%22%2C%22%24device_id%22%3A%2216f65612f75770-0a06d44f49f747-3a65420e-2073600-16f65612f769d9%22%7D; index_location_city=%E6%B7%B1%E5%9C%B3; X_HTTP_TOKEN=eba94a1ed2839078190020875166bed98c35c75552; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1578020091; LGSID=20200103105451-69586484-2dd4-11ea-a70a-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Futrack%2FtrackMid.html%3Ff%3Dhttps%253A%252F%252Fwww.lagou.com%252Fjobs%252Flist%255Fpython%252Fp-city%255F215%253Fpx%253Ddefault%26t%3D1578020088%26_ti%3D2; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%2Fp-city_215%3Fpx%3Ddefault; LGRID=20200103105451-695865ff-2dd4-11ea-a70a-5254005c3644; SEARCH_ID=51bf9f11cda9465093fe6165ce281bb2",
            "Host": "www.lagou.com",
            "Origin": "https://www.lagou.com",
            "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Anit-Forge-Code": "0",
            "X-Anit-Forge-Token": "None",
            "X-Requested-With": "XMLHttpRequest",
        }
        self.request_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
        self.search = search
        self.city = city
        # Accumulates one dict per scraped posting.
        self.datas = []

    def send_request(self, page):
        """POST one search-result page request and return the response.

        Args:
            page: 1-based page number, sent as the ``pn`` form field.

        Returns:
            The ``requests.Response`` for that page.
        """
        # ``needAddtionalResult`` is already part of ``request_url``; passing
        # it again through ``params`` would duplicate it in the query string.
        param = {"city": self.city, "px": "default"}
        data = {"first": True, "pn": page, "kd": self.search}
        # A timeout keeps a stalled connection from blocking the crawl forever.
        return requests.post(
            url=self.request_url,
            params=param,
            data=data,
            headers=self.headers,
            timeout=30,
        )

    def ansaly_data(self, max_pages=49, delay=20):
        """Crawl result pages, collect postings and dump them to ``OUTPUT_FILE``.

        Stops early at the first page that returns no postings.

        Args:
            max_pages: highest page number to request (pages 1..max_pages).
            delay: seconds to sleep after each non-empty page.
        """
        for page in range(1, max_pages + 1):
            resp = self.send_request(page)
            # Decode the body once and reuse it for the emptiness check and
            # the extraction loop.
            results = resp.json()["content"]["positionResult"]["result"]
            if not results:
                break
            self.datas.extend(
                {
                    '学历': position['education'],
                    '薪水': position['salary'],
                    '工作经验': position['workYear'],
                }
                for position in results
            )
            # Throttle between pages.
            time.sleep(delay)
        # The payload holds non-ASCII text (ensure_ascii=False), so pin the
        # encoding rather than depending on the platform default.
        with open(self.OUTPUT_FILE, "w", encoding="utf-8") as f:
            json.dump(self.datas, f, ensure_ascii=False)
        print("解析完成")

    @classmethod
    def _salary_upper_bound(cls, salary):
        """Return the last "<n>k" amount in ``salary`` as an int, or None.

        For a range such as "15k-30k" that is the upper end (30). Values that
        are not strings or contain no "<n>k" amount yield None.
        """
        if not isinstance(salary, str):
            return None
        amounts = cls._SALARY_RE.findall(salary)
        return int(amounts[-1]) if amounts else None

    @classmethod
    def _bucket_salaries(cls, salaries):
        """Count salaries per bracket defined by ``SALARY_BOUNDS``.

        Args:
            salaries: iterable of salary strings such as "15k-30k".

        Returns:
            A list with ``len(SALARY_BOUNDS) + 1`` counts; index ``i`` counts
            salaries whose upper bound is <= ``SALARY_BOUNDS[i]`` and above
            the previous edge, and the last index counts everything above the
            final edge. Unparseable entries are skipped.
        """
        counts = [0] * (len(cls.SALARY_BOUNDS) + 1)
        for salary in salaries:
            upper = cls._salary_upper_bound(salary)
            if upper is None:
                continue
            # bisect_left keeps each edge inclusive on its lower bracket
            # (e.g. an upper bound of exactly 8 lands in the "<=8" bracket).
            counts[bisect_left(cls.SALARY_BOUNDS, upper)] += 1
        return counts

    def create_report(self):
        """Read ``OUTPUT_FILE`` and show a pie chart of salary brackets."""
        # Use the SimHei font so Chinese text is not rendered as garbled glyphs.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        with open(self.OUTPUT_FILE, encoding="utf-8") as f:
            records = json.load(f)
        data = self._bucket_salaries(record.get('薪水') for record in records)
        if not any(data):
            # A pie chart cannot be drawn from all-zero wedges.
            print("没有可用的薪资数据")
            return

        plt.figure(1, dpi=100)
        plt.pie(
            data,  # wedge sizes; normalized into percentages
            explode=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],  # offset of each wedge from the centre
            colors=['y', 'r', 'g', '#89e8e1', '#69e8a1', "#98e8e1", "#46e8e2"],  # wedge colours
            # Last bracket holds upper bounds strictly above 30k.
            labels=['<=8K', '8-12k', '12-15k', '15-18k', '18-22k', "22-30k", ">30k"],  # wedge labels
            labeldistance=1.1,  # radial distance of the labels
            autopct='%1.1f%%',  # percentage format
            pctdistance=0.6,  # radial distance of the percentages
            shadow=False,  # no drop shadow
            startangle=90,  # rotate the start 90 degrees counter-clockwise from the +x axis
            radius=1.2,  # pie radius
        )
        plt.show()

    def main(self):
        """Run the crawl, then build the report if the JSON file exists."""
        self.ansaly_data()
        if os.path.exists(os.path.join(os.getcwd(), self.OUTPUT_FILE)):
            self.create_report()
        else:
            print("json文件未生成")

python3-对拉勾网数据爬取及简单的数据分析

原文:https://www.cnblogs.com/zhouzetian/p/12161609.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!