首页 > 编程语言 > 详细

python爬取链家租房信息

时间:2020-04-15 00:54:21      阅读:62      评论:0      收藏:0      [点我收藏+]
import requests as rq
from bs4 import BeautifulSoup
import json
import time
import pandas as pd

# Scrape Beijing rental listings from Lianjia, district by district, and
# export everything to an Excel workbook.
#
# Flow: landing page -> district entry links -> every result page of each
# district -> every listing detail page -> one row per listing.
#
# NOTE(review): the quotes around every string literal were lost when this
# script was extracted from the web page; they are restored here so the file
# parses again. The list/field indices below mirror the page layout the
# original author relied on — verify against the live site before running.
home_url = "https://bj.lianjia.com/zufang"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.102 Safari/537.36"
    ),
}

# Landing page
home_rt = rq.get(home_url, headers=headers).text
home_soup = BeautifulSoup(home_rt, "lxml")

# Collect the entry link of every district from the landing page
district_url_rt = home_soup.find_all(
    "li", attrs={"class": "filter__item--level2", "data-type": "district"}
)
district_urls = []
# The first <li> is skipped, exactly as the original index loop did
# (presumably it is the "no filter" entry — TODO confirm).
for district_item in district_url_rt[1:]:
    district_name = district_item.a.string  # district name
    dis_url = "https://bj.lianjia.com" + district_item.a.attrs["href"]  # district link
    district_urls.append([district_name, dis_url])

print(district_urls)
print("区域接口获取完毕")

finally_house_result = []
# Walk every district link and collect the listings behind it
for dis_url in district_urls:
    time.sleep(5)  # pause between districts to avoid hammering the site
    # NOTE(review): the original line ended in a dangling "+" whose right-hand
    # operand was lost in extraction; the bare district name is used here.
    district_name = dis_url[0]
    district_url = dis_url[1]
    district_rt = rq.get(district_url, headers=headers).text
    district_soup = BeautifulSoup(district_rt, "lxml")
    # Number of result pages for the current district
    page_num = int(
        district_soup.find("div", attrs={"class": "content__pg"}).attrs["data-totalpage"]
    )

    # Walk every result page and collect title + address + url of each listing
    house_titurl = []
    for page in range(1, page_num + 1):
        time.sleep(0.8)
        page_url = district_url + f"/pg{page}"  # url of the current result page
        page_results = rq.get(page_url, headers=headers).text
        # Parser given explicitly, consistent with the other BeautifulSoup calls
        page_soup = BeautifulSoup(page_results, "lxml")
        # Listing cards on the current result page
        current_page_rts = page_soup.find_all("div", attrs={"class": "content__list--item"})

        for houselist_rt in current_page_rts:
            house_url = "https://bj.lianjia.com" + houselist_rt.a["href"]  # detail page url
            house_title = houselist_rt.a.img["alt"]  # title
            address_list = houselist_rt.div.find(
                "p", attrs={"class": "content__list--item--des"}
            ).find_all("a")
            address = address_list[1].string + "." + address_list[2].string  # address
            house_titurl.append([house_title, address, house_url])
    district_num = len(house_titurl)
    print(f"{district_name}房屋标题&url获取完毕,共{district_num}套租房信息")

    # Visit every listing of the current district and read its details
    for house_page in house_titurl:
        time.sleep(0.6)
        house_title = house_page[0]  # listing title
        address = house_page[1]  # address
        house_url = house_page[2]  # detail page url
        house_rt = rq.get(house_url, headers=headers).text
        house_soup = BeautifulSoup(house_rt, "lxml")

        # Fee table cells: the value sits at index 5-9, and a <span> inside
        # cells 1-4 is appended to it (presumably the unit — TODO confirm).
        house_rt1 = house_soup.find_all("li", attrs={"class": "table_col"})
        pay_method = house_rt1[5].string  # payment method
        rent = house_rt1[6].string + house_rt1[1].find("span").string  # rent
        deposit = house_rt1[7].string + house_rt1[2].find("span").string  # deposit
        service_fee = house_rt1[8].string + house_rt1[3].find("span").string  # service fee
        agency_fee = house_rt1[9].string + house_rt1[4].find("span").string  # agency fee

        # Basic info items; [3:] drops the first three characters of each
        # entry (presumably the "label:" prefix — TODO confirm).
        house_rt2 = house_soup.find_all("li", attrs={"class": "fl oneline"})
        size = house_rt2[1].string[3:]  # floor area
        toward = house_rt2[2].string[3:]  # orientation
        in_time = house_rt2[5].string[3:]  # move-in date
        rent_term = house_rt2[7].string[3:]  # lease term
        storey = house_rt2[10].string[3:]  # floor
        elevator = house_rt2[11].string[3:]  # elevator
        gas = house_rt2[17].string[3:]  # gas

        # Supporting facilities: every item from index 21 onwards. The already
        # fetched list is reused instead of re-querying the document per item.
        supporting_facilities = [faci.text.strip() for faci in house_rt2[21:]]
        supporting_facilities = json.dumps(supporting_facilities, ensure_ascii=False)

        # Agent information
        agency_names = house_soup.find_all("a", attrs={"class": "name"})
        agency_phones = house_soup.find_all("div", attrs={"class": "phone"})
        agency_scores = house_soup.find_all("div", attrs={"class": "rate"})
        agency_list = [
            {"中介姓名": name.string, "电话": phone.string, "评分": score.text.strip()}
            for name, phone, score in zip(agency_names, agency_phones, agency_scores)
        ]
        agency_list = json.dumps(agency_list, ensure_ascii=False)

        finally_house_result.append([
            district_name, address, house_title, size, toward, storey, elevator,
            gas, supporting_facilities, rent_term, in_time, rent, deposit,
            service_fee, agency_fee, agency_list,
        ])
    print(f"{district_name}房屋信息获取完毕,共{district_num}套")

data_num = len(finally_house_result)
columns = [
    "区域", "地址", "标题", "面积", "朝向", "楼层", "电梯", "燃气",
    "配套设施", "租期", "入住时间", "房租", "押金", "服务费", "中介费", "中介联系方式",
]
house_finally_dfdata = pd.DataFrame(finally_house_result, columns=columns)
house_finally_dfdata.to_excel("d:\\Desktop\\20191124链家北京各城区租房信息.xlsx")
print(f"北京市各城区租房信息获取完毕,共{data_num}套")

 

python爬取链家租房信息

原文:https://www.cnblogs.com/jaysonteng/p/12702066.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!