
A simple crawler: scraping Boss Zhipin job listings

Posted: 2020-08-17 08:23:27

I have been learning web scraping recently and picked Boss Zhipin to practice on. The approach:

1. To get past basic anti-crawling measures, every request uses a proxy IP and a randomly generated user-agent, each wrapped in its own helper function;

2. Visit Boss Zhipin normally in the browser, press F12 to watch the traffic, pick a city and a search keyword, and run the search;

3. Copy the request headers shown in the F12 panel into the code; the cookie is saved to a file and read back when needed (see the short sketch right after this list);

4. Use the requests package to send the requests, and BeautifulSoup plus re to pull out the fields I need;

5. Collect the results into a list; later they can go into a database or feed a simple analysis (a sketch of dumping the rows to a CSV follows the full script below).
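For step 3, the cookie copied from the F12 panel has to land in bosscookies.txt before the crawler runs. A minimal one-off sketch, assuming you paste the Cookie header value yourself; the helper name save_cookie and the placeholder string are mine, only the file name matters to the getCookies() function in the script below:

# One-off helper: paste the Cookie header value copied from the browser's
# F12 Network panel; getCookies() in the crawler reads this file back.
def save_cookie(cookie_string, path="bosscookies.txt"):
    with open(path, "w", encoding="utf-8") as wstream:
        wstream.write(cookie_string.strip())

save_cookie("paste the cookie string from F12 here")  # placeholder value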

 

The data is not fully collected yet and the result is not what I wanted. To get the job descriptions I have to request each job-detail link one by one, but after a few requests the site stops responding entirely, even when I feed a single link straight into the askUrl method. I spent half a day on it and still have not found a solution. Exhausting...
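One unverified guess: the detail pages trip Boss Zhipin's rate limiting once the same IP or cookie fires several requests in a row. A possible mitigation, sketched on top of the askUrl and getProxyIP helpers in the script below, is to pause a random interval before every detail request and retry a failed one with a fresh proxy. The retry count, the delays, and the askUrlWithRetry name are my own guesses, not a verified fix:

from random import uniform
from time import sleep
import requests

def askUrlWithRetry(url, retries=3):
    # Pause a random few seconds before each attempt and let askUrl pick a
    # fresh proxy every time; give up after `retries` failed attempts.
    for attempt in range(retries):
        sleep(uniform(3, 8))
        try:
            html = askUrl(url)
            if html:                  # crude check; a blocked request may still return a page
                return html
        except requests.RequestException:
            continue                  # proxy or network failure, try the next proxy
    return ""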

For now, here is the simple code that already works.

"""
1.爬取代理IP
2.随机获取代理IP
3.随机生成user-agent
4.每次请求后设置睡眠时间
"""

import requests
from bs4 import BeautifulSoup
import re
from fake_useragent import UserAgent
from time import sleep
from random import choice
from urllib.parse import quote

def main():
    # Earlier experiments, kept commented out:
    # getUserAgent()
    # getProxyIpPool()
    # ip = getProxyIP()
    # print(ip)
    position = '测试工程师'     # search keyword
    cityNum = '101280600'       # city code copied from the search URL in the browser
    getBossJobs(position, cityNum)
    job_desc = getJobDesc('https://www.zhipin.com/job_detail/e90f31d8c5f61f9d0Xx42tS0EFM~.html')
    print(job_desc)

def getUserAgent():
    UA = UserAgent()
    ua = UA.random
    return ua

def getProxyIpPool():
    """
    Scrape proxy IPs from a free-proxy page and collect them as "ip:port" strings.
    :return: list of proxy addresses
    """
    data = []
    url = 'https://www.kuaidaili.com/free'
    userAgent = getUserAgent()
    cookie = 'channelid=0; sid=1596648322236711; _ga=GA1.2.333258184.1596648323; _gid=GA1.2.1094813415.1597158985; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1596648323,1596822836,1597158985; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1597158985'
    header = {'User-Agent': userAgent, 'Cookie': cookie, 'Upgrade-Insecure-Requests': '1'}
    res = requests.get(url=url, headers=header)
    soup = BeautifulSoup(res.text, 'html.parser')
    for table in soup.find_all('table', class_='table-bordered'):
        tds = str(table)
        ips = re.findall(r'<td data-title="IP">(.*?)<', tds)
        ports = re.findall(r'<td data-title="PORT">(.*?)<', tds)
        for ip, port in zip(ips, ports):
            data.append(ip + ":" + port)    # collect every row of the table, not just the first
    sleep(3)    # be gentle with the proxy site
    return data

def getProxyIP():
    """
    Pick one proxy at random and wrap it in the dict format requests expects.
    :return: proxies dict
    """
    data = getProxyIpPool()
    proxyip = choice(data)
    proxies = {
        "http": "http://" + proxyip
    }
    return proxies

def getCookies():
    """Read back the Boss Zhipin cookie string saved from the browser."""
    with open(r"bosscookies.txt", "r") as rstream:
        cookies = rstream.readline().strip()    # strip the trailing newline so requests accepts the header value
    return cookies

def getUrl(position, cityNum):
    """Build the Boss Zhipin search URL for a keyword and a city code."""
    position = quote(position)
    url = 'https://www.zhipin.com/job_detail/?query=' + position + '&city=' + cityNum + '&industry=&position='
    return url

def askUrl(url):
    """Request a Boss Zhipin page with a random proxy, a random user-agent and the saved cookie."""
    proxies = getProxyIP()
    userAgent = getUserAgent()
    cookie = getCookies()
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "referer": "https://www.zhipin.com/",
        "cookie": cookie,
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": userAgent,
    }
    res = requests.get(url=url, headers=headers, proxies=proxies)
    return res.text

def getHtml(htmlname):
    """
    Read a previously saved HTML file (useful for testing the parsers offline).
    :return: html
    """
    with open(htmlname, "r", encoding='UTF-8') as htmlstream:
        html = htmlstream.read()
    return html

def getBossJobs(position, cityNum):
    """
    Scrape the first results page for a given position on Boss Zhipin.
    Fields collected: job title, job-detail link, location, salary, publish time, hiring company.
    :return: list of rows, one per job posting
    """
    start_html = 'https://www.zhipin.com'
    datalist = []                           # all rows
    url = getUrl(position, cityNum)         # build the search URL and request it
    html = askUrl(url)

    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('div', class_="job-primary"):
        data = []                           # fields of the current posting
        job_name_info = item.find_all('span', class_='job-name')[0]
        job_link = start_html + job_name_info.find_all('a')[0]['href']     # link to the job-detail page
        job_title = job_name_info.find_all('a')[0]['title']                # job title
        data.append(job_title)
        data.append(job_link)
        job_area = re.findall('<span class="job-area">(.*?)</span>', str(item))[0]             # location
        data.append(job_area)
        job_salary = re.findall('<span class="red">(.*?)</span>', str(item))[0]                # salary
        data.append(job_salary)
        job_pub_time = re.findall('<span class="job-pub-time">(.*?)</span>', str(item))[0]     # publish time
        data.append(job_pub_time)
        company_info = item.find_all('h3', class_="name")[1]
        company_name = company_info.find_all('a')[0]['title']              # hiring company
        data.append(company_name)
        # job_description = getJobDesc(job_link)   # fetching every description gets blocked quickly
        # sleep(3)                                 # sleep after each request to avoid the anti-crawler
        # data.append(job_description)
        datalist.append(data)
    print(datalist)
    return datalist

def getJobDesc(url):
    """Fetch one job-detail page and return the job-description text."""
    job_detail_html = askUrl(url)
    job_detail_soup = BeautifulSoup(job_detail_html, "html.parser")
    job_description = job_detail_soup.find_all('div', class_="text")[0]    # job-description block
    job_desc = job_description.text.replace('\n', "").strip()              # drop newlines and surrounding spaces
    return job_desc

if __name__ == '__main__':
    main()
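For step 5, the rows returned by getBossJobs can be written straight to a CSV before any database or analysis work. A minimal sketch, assuming the six fields stay in the order they are appended above; the save_to_csv name and the output file name are mine:

import csv

def save_to_csv(datalist, path="boss_jobs.csv"):
    # One row per posting, in the same order the fields are appended in getBossJobs.
    headers = ["title", "link", "area", "salary", "pub_time", "company"]
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(datalist)

# e.g. save_to_csv(getBossJobs('测试工程师', '101280600'))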

  


Original: https://www.cnblogs.com/dayan007/p/13515318.html
