首页 > 其他 > 详细

爬取进阶-爬取拉勾网数据

时间:2020-11-09 09:49:58      阅读:39      评论:0      收藏:0      [点我收藏+]

.py文件

import requests
import math
import time
import pandas as pd


def get_json(url, num):
    """Fetch one page of Lagou job-search results and return the parsed JSON.

    A GET request is first sent to the search landing page so that the
    session receives cookies; those cookies are then attached to the POST
    request against ``url``.

    :param url: Ajax endpoint the search form is POSTed to.
    :param num: Page number, sent as the ``pn`` form field.
    :return: The decoded JSON body of the POST response.
    :raises requests.HTTPError: If the POST response has an error status.
    :raises requests.RequestException: On timeout or connection failure.
    """
    url1 = 'https://www.lagou.com/jobs/list_python%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': 'None',
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {
        'first': 'true',
        'pn': num,
        'kd': 'python工程师',
    }
    # The context manager closes the session's connection pool on exit.
    with requests.Session() as s:
        print('建立session:', s, '\n\n')
        # Visit the landing page first purely to collect cookies.
        s.get(url=url1, headers=headers, timeout=3)
        cookie = s.cookies
        print('获取cookie:', cookie, '\n\n')
        res = requests.post(url, headers=headers, data=data, cookies=cookie, timeout=3)
        res.raise_for_status()
        res.encoding = 'utf-8'
        page_data = res.json()
    print('请求响应结果:', page_data, '\n\n')
    return page_data


def get_page_num(count):
    """Return how many result pages to crawl for ``count`` matching jobs.

    The count is divided into pages of 15 jobs each (rounded up), and the
    result is capped at 30 pages.

    :param count: Total number of matching positions.
    :return: Number of pages to fetch, never more than 30.
    """
    return min(math.ceil(count / 15), 30)


def get_page_info(jobs_list):
    """Extract the fields of interest from each job record on a page.

    :param jobs_list: Iterable of job mappings (one per position).
    :return: A list with one row per job; each row is a list of the job's
        values in the fixed field order defined below.
    :raises KeyError: If a job record lacks one of the expected fields.
    """
    # Order matters: it defines the column order of every returned row.
    fields = (
        'companyFullName',
        'companyShortName',
        'companySize',
        'financeStage',
        'district',
        'positionName',
        'workYear',
        'education',
        'salary',
        'positionAdvantage',
        'industryField',
        'firstType',
        'companyLabelList',
        'secondType',
        'city',
    )
    return [[job[field] for field in fields] for job in jobs_list]


def main():
    """Crawl all result pages for the search and save the jobs to a CSV file.

    The first page is requested once to read the total job count, from which
    the number of pages is derived. Each page is then fetched in turn, with a
    pause between requests, and the accumulated rows are written to
    ``Python_development_engineer2.csv`` after every page.
    """
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    first_page = get_json(url, 1)
    total_page_count = first_page['content']['positionResult']['totalCount']
    page_total = get_page_num(total_page_count)
    total_info = []
    time.sleep(10)
    print("python开发相关职位总数:{},总页数为:{}".format(total_page_count, page_total))
    for page in range(1, page_total + 1):
        # Fetch the job data for this page.
        page_data = get_json(url, page)  # parsed JSON response
        jobs_list = page_data['content']['positionResult']['result']  # all jobs on this page
        page_info = get_page_info(jobs_list)
        print("每一页python相关的职位信息:%s" % page_info, '\n\n')
        total_info += page_info
        print('已经爬取到第{}页,职位总数为{}'.format(page, len(total_info)))
        time.sleep(20)
        # Convert everything collected so far to a DataFrame and write it to
        # the CSV file; the file is rewritten in full after each page.
        # NOTE(review): presumably a per-page checkpoint - confirm whether the
        # save was meant to run once after the loop instead.
        df = pd.DataFrame(data=total_info,
                          columns=['公司全名', '公司简称', '公司规模', '融资阶段', '区域', '职位名称', '工作经验', '学历要求', '薪资', '职位福利', '经营范围',
                                   '职位类型', '公司福利', '第二职位类型', '城市'])
        df.to_csv('Python_development_engineer2.csv', index=False)
        print('python相关职位信息已保存')


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

在 JupyterLab 中可通过 %load lagou-Copy1.py 和 %run lagou-Copy1.py 运行。(注:df.to_csv('Python_development_engineer2.csv', index=False) 中的 .csv 文件会由程序自动生成,无需预先创建。)

技术分享图片

 

 技术分享图片

 

 技术分享图片

 

 技术分享图片

 

爬取进阶-爬取拉勾网数据

原文:https://www.cnblogs.com/chenaiiu/p/13946540.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!