#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/12/24 17:01
# @Site :
# @File : shell.py
# @Software: PyCharm
import json
import urllib3
import requests
from pyquery import PyQuery
# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted
# for every request made with verify=False (the original author did this
# while capturing traffic through the Fiddler proxy).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# HTTP headers sent with every request issued by this script.
headers = {
    "Referer": "https://ag.fang.ke.com/loupan",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}
def shell_room_page(pgmax):
    """Collect project codes from the listing pages ``pg1`` .. ``pg<pgmax>``.

    Each page ``https://cq.fang.ke.com/loupan/pg<N>`` is fetched and every
    ``<li>`` under ``.resblock-list-wrapper`` that carries a
    ``data-project-name`` attribute contributes that attribute's value.
    Progress is printed while scraping.

    Args:
        pgmax: Number of the last page to fetch (pages are 1-based). A value
            below 1 fetches nothing.

    Returns:
        A list of ``data-project-name`` values in page order. These are the
        codes ``shell_room_detail`` appends to the detail-page URL.

    Raises:
        Whatever ``requests.get`` raises on a network failure; nothing is
        caught here.
    """
    room_page_list = []
    for page_no in range(1, pgmax + 1):
        url = 'https://cq.fang.ke.com/loupan/pg' + str(page_no)
        print("正在获取的链接:%s" % url)
        # verify=False skips TLS certificate verification (needed when the
        # traffic goes through an intercepting proxy such as Fiddler).
        response = requests.get(url, headers=headers, verify=False)
        print("正在获取%s页房源......" % page_no)
        page_doc = PyQuery(response.text)

        page_count = 0  # projects found on the current page
        for item in page_doc('.resblock-list-wrapper li ').items():
            project_name = item.attr('data-project-name')
            if project_name is None:
                # <li> elements without the attribute are not project entries.
                continue
            room_page_list.append(project_name)
            page_count += 1

        # The running total is simply the length of the accumulated list.
        print("当前是第%s页,本页有%s套资源,当前共获取%s套资源!"
              % (page_no, page_count, len(room_page_list)))

    print(room_page_list)
    return room_page_list
def _split_reference_prices(price_list):
    """Turn the texts of the ``.price span`` elements into three price strings.

    Args:
        price_list: Texts of the price spans, in document order.

    Returns:
        A tuple ``(ref_ave_price, ref_total_price, ref_unit_price)``.

    Raises:
        IndexError: If ``price_list`` is shorter than the layout it appears
            to follow (including an empty list).
    """
    if price_list[0] == u'价格待定':
        # Not yet on sale and no reference price published.
        pending = u'未开盘,价格待定'
        return pending, pending, pending
    ref_ave_price = price_list[1] + price_list[2]
    if price_list[3] == u'参考单价':
        # No total price: the unit price follows the average price directly.
        return ref_ave_price, u'暂无总价', price_list[4] + price_list[5]
    # All three quotes present; position 5 is not used.
    return (ref_ave_price,
            price_list[3] + price_list[4],
            price_list[6] + price_list[7])


def _first_content_text(spans, default):
    """Return the text of the first span whose class is exactly ``content``.

    ``default`` is returned when no such span exists, including when the
    selection is empty.
    """
    for span in spans.items():
        if span.attr('class') == "content":
            return span.text()
    return default


def shell_room_detail(list):
    """Fetch and print the detail record of every project code in ``list``.

    For each code the page ``https://cq.fang.ke.com/loupan/p_<code>`` is
    downloaded, prices, opening date, address, tags, layouts and sale
    status/type are extracted, and the resulting dict is printed as JSON.

    Args:
        list: Iterable of project codes (as returned by ``shell_room_page``).
            The name shadows the builtin but is kept so existing keyword
            callers continue to work.

    Returns:
        None. Results are only printed.
    """
    for index, room_code in enumerate(list):
        try:
            url = 'https://cq.fang.ke.com/loupan/p_' + room_code
            print(url)
            # verify=False skips TLS certificate verification.
            response = requests.get(url, headers=headers, verify=False)
            detail_doc = PyQuery(response.text)

            # Prices
            price_list = [span.text()
                          for span in detail_doc('.price span').items()]
            ref_ave_price, ref_total_price, ref_unit_price = \
                _split_reference_prices(price_list)

            # Latest opening date and project address. A default is always
            # supplied so a page lacking these spans can neither fail nor
            # reuse the value left over from the previous listing.
            opendate = _first_content_text(
                detail_doc('.open-date span'), u'未知')
            addres = _first_content_text(
                detail_doc('.info-item span'), u'未知')

            # Tags: every "item" entry, each followed by '/'.
            pro_tag = "".join(
                tag.text() + '/'
                for tag in detail_doc(".top-info ul li").items()
                if tag.attr('class') == "item"
            )

            # Layouts: every span under .content, each followed by '/'.
            style = "".join(
                span.text() + '/'
                for span in detail_doc('.content span').items()
            )

            # Sale status and property type: the first two tags, when present.
            type_list = [span.text()
                         for span in detail_doc('.tags-wrap span').items()]
            if len(type_list) >= 2:
                sell_house_type = type_list[0] + '/' + type_list[1]
            else:
                sell_house_type = None

            pro_name = detail_doc('.title-wrap div h2').text()
            detail_dic = {
                "pro_name": pro_name,
                "room_code": room_code,
                "ref_ave_price": ref_ave_price,
                "ref_total_price": ref_total_price,
                "ref_unit_price": ref_unit_price,
                "new_open_date": opendate,
                "pro_addr": addres,
                "pro_tag": pro_tag,
                "style_room": style,
                "sell_house_type": sell_house_type,
            }
            print("正在获取第%s套信息......" % (index + 1))
            print("第%s套信息:" % (index + 1))
            # No ``encoding`` argument: it is the default on Python 2 and is
            # rejected with a TypeError on Python 3.
            print(json.dumps(detail_dic, ensure_ascii=False))
        except Exception as e:
            # Per-listing boundary: report the problem and move on to the
            # next code instead of aborting the whole run.
            print(e)
if __name__ == '__main__':
    # Scrape listing pages 1-100, then print the details of every project
    # found. The result is not called ``list`` so the builtin stays usable.
    room_codes = shell_room_page(100)
    shell_room_detail(room_codes)
# Original article: https://www.cnblogs.com/East-fence/p/12112402.html