要买房,但是大西安现在可谓是一房难求,大家都争先恐后地排队交资料、摇号。截至目前,笔者已经参与过6个楼盘的摇号/选房,但由于种种原因,依然没买到合适的房子,无奈,一首 凉~ 凉~ 回荡在心~
价格公示,每次都会在买房群里热议,因为看到新公布的价格就意味着有更多房源即将开盘,大家买房的热情又会被重新点燃~
闪念:有一次我冒出一个念头——如果能实时监控物价局官网、自动下载压缩包并提醒我,岂不快哉?于是,之前了解的一点点网络爬虫知识就派上了用场~
于是,开干!
首先,用selenium(PhantomJS)爬取网站的下载链接
然后,用Python urllib.request模块的urlretrieve()方法下载压缩包
接着,用Python zipfile模块的extractall()方法解压
最后,定时运行脚本,有下载更新时弹出提示
更新提醒可以通过发送邮件实现,但是笔者用了最简单的方法,即有更新时调用openFolder.bat批处理脚本自动打开house_prices文件夹
先上两张物价局官网
......
from selenium import webdriver
def get_url():
    """
    Record newly published first-level download page links in url_list.txt.

    Uses the module-level ``driver`` and ``cur_path``. Links are written in
    reverse page order (the page presumably lists the newest entry first), so
    the last line of the file carries the trid used for the next comparison.

    :return: download_flag -- 0 when nothing was appended; 1 when the file
        held no links yet and the listed links were written in bulk;
        otherwise the number of links appended because their trid is greater
        than the last recorded one.
        NOTE(review): a value of 1 is also returned when exactly one new link
        is appended, so callers cannot tell that case from a first run by the
        flag alone.
    """
    url_file = cur_path + "/house_prices/url_list.txt"

    # The listing table is rendered inside an iframe.
    driver.switch_to.frame('iframecenter')
    date_list = driver.find_elements_by_xpath('.//*[@id="tablelist"]/tbody/tr/td[3]/span')
    # Query the links once instead of re-running the lookup on every iteration.
    links = driver.find_elements_by_id('linkId')

    # Read what is already recorded; blank lines are ignored so a stray
    # trailing newline cannot break the trid parsing below.
    try:
        with open(url_file, 'r') as fr:
            known = [line.strip() for line in fr if line.strip()]
    except FileNotFoundError:
        known = []

    if not known:
        # Nothing recorded yet: take every listed row, bottom of the page first.
        pending = [
            link.get_attribute('href')
            for link in reversed(links[:len(date_list)])
        ]
        download_flag = 1 if pending else 0
    else:
        # trid of the most recent link already stored in the txt file.
        last_seen = int(known[-1].split('=')[1])
        pending = []
        # Only the first five links on the page are checked; slicing (rather
        # than indexing 0..4) keeps this safe when fewer than five exist.
        for link in reversed(links[:5]):
            href = link.get_attribute('href')
            if last_seen < int(href.split('=')[1]):
                pending.append(href)
        download_flag = len(pending)

    # Append mode creates the file when it is missing, even if nothing is new.
    with open(url_file, 'a') as fw:
        fw.writelines(href + '\n' for href in pending)
    return download_flag
......
from urllib import request
import zipfile
import os
......
def _read_url_lines(url_file):
    """Return the non-blank, stripped lines of url_file ([] if it is missing)."""
    try:
        with open(url_file, 'r') as fr:
            return [line.strip() for line in fr if line.strip()]
    except FileNotFoundError:
        return []


def _fetch_and_unpack(page_url):
    """Open one first-level page, download its price zip and extract it."""
    driver.get(page_url)
    # The download link lives inside an iframe.
    driver.switch_to.frame('showconent1')
    link = driver.find_element_by_partial_link_text('商品住房价格')
    download_url = link.get_attribute('href')
    # NOTE(review): index 6 assumes a fixed URL depth for the archive name.
    zipname = cur_path + '/house_prices/' + download_url.split('/')[6]
    # splitext strips only the final extension; splitting on the first '.'
    # would truncate the path if any parent folder name contains a dot.
    filename = os.path.splitext(zipname)[0]
    request.urlretrieve(download_url, zipname)
    # Extract, then delete the archive; a bad archive is left on disk.
    try:
        with zipfile.ZipFile(zipname) as zfile:
            zfile.extractall(path=filename)
        if os.path.exists(zipname):
            os.remove(zipname)
    except zipfile.BadZipFile:
        print(filename + " is a bad zip file ,please check!")


def download_zip():
    """
    Download and unpack the archives for every link newly added by get_url().

    The number of links recorded before get_url() runs is remembered, and only
    the lines appended after that point are processed. This avoids relying on
    the flag value alone, which is 1 both for a first run and for a single new
    publication, and previously caused the whole history to be downloaded
    again whenever exactly one new link appeared.

    :return: None
    """
    url_file = cur_path + "/house_prices/url_list.txt"
    already_recorded = len(_read_url_lines(url_file))

    flag = get_url()
    new_urls = _read_url_lines(url_file)[already_recorded:]
    if flag == 0 or not new_urls:
        return

    # The implicit wait is a session-wide setting, so set it once up front.
    driver.implicitly_wait(15)
    for page_url in new_urls:
        _fetch_and_unpack(page_url)

    # New data arrived: open the output folder as the update notification.
    os.system(cur_path + "/openFolder.bat")
' Launch downloadZip.py from this script's own folder without showing a window.
currentpath = createobject("Scripting.FileSystemObject").GetFile(Wscript.ScriptFullName).ParentFolder.Path
' Quote the path so a folder name containing spaces is still treated as one command; 0 hides the window.
createobject("wscript.shell").run """" & currentpath & "\downloadZip.py" & """", 0
@rem Open the house_prices folder located next to this script.
@rem The script-directory variable already ends with a backslash, and START
@rem takes its first quoted argument as the window title, hence the empty "".
start "" "%~dp0house_prices"
Git地址:https://gitee.com/freedomlidi/autoDownloadZip.git
原文:https://www.cnblogs.com/freedomlidi/p/12431224.html