import xlrd
import time
import sys
import os
import requests
import sqlite3
import threading
# Directory containing this script; the data files, result directories and
# the SQLite status database are all resolved relative to it.
curPath = os.path.abspath(os.path.dirname(__file__))
# Parent of the script directory, appended to the module search path.
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# A key is still handed out while its today_used count is <= this value.
MAX_USED_TIMES = 1700
# Message text meaning "daily quota exceeded, access restricted"; it is not
# referenced anywhere else in this file.
overrun_str = '天配额超限,限制访问'
# Sentinel returned by db_get_one_effective() when no usable key is left.
DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'

# Absolute path of the SQLite database holding the per-key usage counters.
# os.path.join replaces the hard-coded '\\' separator; the result is the same
# on Windows and also valid on other platforms.
db = os.path.join(curPath, 'py_bdspider_status.db')
# pcity_list = []
# pcity_file = ‘%s\\%s‘ % (curPath, ‘省会城市.txt‘)
# with open(pcity_file, ‘r‘, encoding=‘utf-8‘) as pf:
# c_ = 0
# for i in pf:
# c_ += 1
# if c_ == 3:
# c_ = 0
# pcity_list.append(i.replace(‘ ‘, ‘‘).replace(‘\n‘, ‘‘) + ‘市‘)
# pcity_sorted_list = sorted(pcity_list)
#
# target_city_list_big = [‘广州市‘, ‘厦门市‘, ‘深圳市‘, ‘北京市‘, ‘杭州市‘, ‘成都市‘, ‘上海市‘, ‘西安市‘]
# target_city_list_pass = target_city_list_big
#
# for i in pcity_list:
# if i not in target_city_list_big:
# target_city_list_pass.append(i)
# def db_init_key_table():
# conn = sqlite3.connect(db)
# c = conn.cursor()
# sql = ‘DELETE FROM baidu_map_key_used‘
# c.execute(sql)
# conn.commit()
# pcity_file = ‘%s\\%s‘ % (curPath, ‘bdmap_key.txt‘)
# with open(pcity_file, ‘r‘, encoding=‘utf-8‘) as pf:
# c_ = 0
# for i in pf:
# if len(i) < 4:
# continue
# author, key = i.replace(‘ ‘, ‘‘).replace(‘\n‘, ‘‘).replace(‘\t‘, ‘‘).split(‘;‘)
# localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
# sql = ‘INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ‘ % (
# author, key, localtime_, 0)
# c.execute(sql)
# conn.commit()
# conn.close()
# pf.close()
# db_init_key_table()
# target_city_list = target_city_list[0:11]
# target_city_list = target_city_list[0:11]
def db_get_one_effective():
    """Return the least-used API key that is still within the daily limit.

    Selects from ``baidu_map_key_used`` the keys whose ``today_used`` is at
    most ``MAX_USED_TIMES``, ordered by ascending ``today_used``, and takes
    the first row.

    Returns:
        The ``key`` column of that row, or ``DB_KEY_EXHAUST`` when no row
        qualifies.
    """
    sql = ('SELECT key FROM baidu_map_key_used '
           'WHERE today_used<=? ORDER BY today_used ASC')
    conn = sqlite3.connect(db)
    try:
        res = conn.execute(sql, (MAX_USED_TIMES,)).fetchone()
    finally:
        # The original wrote `conn.close` (no call) after both return
        # statements, so the connection was never closed.
        conn.close()
    return DB_KEY_EXHAUST if res is None else res[0]
def db_update_one_today_used(key):
    """Count one more use of *key* today and refresh its update timestamp.

    Increments ``today_used`` and sets ``update_time`` to the current local
    time formatted as ``yymmddHHMMSS`` for the row of ``baidu_map_key_used``
    whose ``key`` column equals *key*.

    Args:
        key: The API key whose counter is incremented.
    """
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    sql = ('UPDATE baidu_map_key_used '
           'SET today_used = today_used+1 ,update_time=? WHERE key=?')
    conn = sqlite3.connect(db)
    try:
        # Bound parameters replace string formatting: the original wrapped
        # the key in double quotes, which SQLite may resolve as an identifier
        # rather than a string. The timestamp was spliced in unquoted, i.e.
        # as a numeric literal, so it is bound as an int to keep that value.
        conn.execute(sql, (int(localtime_), key))
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()
# Sub-directory of curPath that receives the saved API responses.
dir_ = 'baidu_map_uid_page'
# Sub-directory of curPath that receives the marker files written when a
# request raised an exception.
dir_exception = 'baidu_map_uid_page_exception'
# Ids already covered by a saved file; filled by gen_requested_file_list().
requested_file_list = []

# Both directory paths keep a trailing separator because callers append the
# file name directly. os.path.join replaces the hard-coded '\\' separators;
# the result is the same on Windows and also valid on other platforms.
requested_file_dir_str = os.path.join(curPath, dir_, '')
requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')

# Directory listing taken once at import time; raises if the result
# directory does not exist.
requested_file_dir = os.listdir(requested_file_dir_str)
def gen_requested_file_list(file_postfix='.html'):
    """Append the ids of already-saved result files to ``requested_file_list``.

    Every entry of the result directory (``curPath``/``dir_``) is reduced to
    an id by cutting its name at the first '&' and then at the first
    occurrence of *file_postfix*. Ids not yet present are appended to the
    module-level ``requested_file_list``. Entries that do not contain
    *file_postfix* are kept with their full name.

    Args:
        file_postfix: Suffix to strip from the entry names.
    """
    filepath = os.path.join(curPath, dir_)
    # Set mirror of the list so each membership test is O(1) instead of a
    # scan over every id collected so far.
    seen = set(requested_file_list)
    for entry in os.listdir(filepath):
        # Work on the entry name itself. The original concatenated it onto
        # the directory path and split on the directory name to get it back,
        # which returns the wrong piece whenever that name also occurs in the
        # script's own path.
        requested_file = entry.split('&')[0].split(file_postfix)[0]
        if requested_file not in seen:
            seen.add(requested_file)
            requested_file_list.append(requested_file)
def gen_file_data(fname_source, file_type='.xlsx'):
    """Read the first sheet of a workbook into a list of string rows.

    The workbook is ``curPath``/``fname_source`` + ``file_type``.

    Args:
        fname_source: File name without extension, relative to ``curPath``.
        file_type: Extension appended to *fname_source*.

    Returns:
        One list per sheet row, each holding ``str(cell.value)`` for every
        cell of that row.
    """
    # NOTE(review): recent xlrd releases read only .xls files - confirm the
    # installed version still supports the default '.xlsx'.
    excel_ = os.path.join(curPath, '%s%s' % (fname_source, file_type))
    book = xlrd.open_workbook(excel_, on_demand=True)
    try:
        sheet = book.sheet_by_index(0)
        return [[str(c.value) for c in sheet.row(i)]
                for i in range(sheet.nrows)]
    finally:
        # Release the workbook resources even if reading the sheet fails.
        book.release_resources()
# city -> district -> {'uid_list': [...], 'file_row_list': [...]}; filled by
# the two sheet loops below and consumed by fun_() and main().
request_dic = {}

# POI type names; only the commented-out type filters refer to them.
target_type_list = [
    '售楼处', '酒店', '专科医院', '家电', '家居建材', '咖啡馆',
]
target_type_except_list = [
    '住宅小区', '写字楼',
    '商场', '小学',
    '中学', '4S店',
    '汽车站', '火车站',
    '高铁站', '飞机场',
]

# Collect the ids of the result files already on disk, once per suffix.
file_postfix_l = ['.html', '.txt']
for i in file_postfix_l:
    gen_requested_file_list(i)

# First input workbook (self-added collector tasks).
# Alternative input carrying is_building columns, disabled in the original:
# '【TEAM】采集员新增任务133598条-楼宇归集-互异百度uid数51700-is_building170901140053'
fname_source = '【TEAM】采集员新增任务133598条-楼宇归集-互异百度uid数51700'
data_selfadd = gen_file_data(fname_source)
def replace_illeagl_tag(str_):
    """Return *str_* with every space, newline and tab character removed."""
    return str_.replace(' ', '').replace('\n', '').replace('\t', '')
# gen_requested_file_list()
# gen_requested_file_list(‘.txt‘)
# Group the not-yet-requested uids of the self-added task sheet by city and
# district inside request_dic.
#
# The saved-file ids are copied into a set once, so the per-row "already
# requested" test does not scan the whole list for every sheet row.
_requested_ids = set(requested_file_list)
for row in data_selfadd:
    # Every row must have exactly these 11 columns; only city, district and
    # uid are used below. (The original also had a disabled 14-column variant
    # adding is_building, name_, addr_, and a disabled type filter on
    # target_type_except_list.)
    (id_, area_code, type_, city, district, uid, name, address, street,
     name_reduction, submit_time) = row
    # Skip rows whose uid has fewer than 6 characters once spaces are removed.
    if len(uid.replace(' ', '')) < 6:
        continue
    city = replace_illeagl_tag(city)
    district = replace_illeagl_tag(district)
    uid = replace_illeagl_tag(uid)
    # Same id layout as the saved result file names.
    input_ = '%s%s%s' % (city, district, uid)
    if input_ in _requested_ids:
        print('requested', input_)
        continue
    district_entry = request_dic.setdefault(city, {}).setdefault(
        district, {'uid_list': [], 'file_row_list': []})
    if uid not in district_entry['uid_list']:
        district_entry['uid_list'].append(uid)
        # file_row_list is not read anywhere else in this file.
        district_entry['file_row_list'].append(row)
# The sheet rows are no longer needed once they are grouped.
del data_selfadd
# Second input workbook (JMTool data with Baidu POI uids).
# Alternative input carrying is_building columns, disabled in the original:
# '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339-is_building170901140150'
fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339'
data_jmtool = gen_file_data(fname_source)

# Group the not-yet-requested uids of this sheet by city and district inside
# request_dic.
#
# The saved-file ids are copied into a set once, so the per-row "already
# requested" test does not scan the whole list for every sheet row.
_requested_ids = set(requested_file_list)
for row in data_jmtool:
    # Every row must have exactly these 24 columns; only city, district and
    # uid are used below. (The original also had a disabled 27-column variant
    # adding is_building, name_, addr_, and a disabled type filter on
    # target_type_except_list.)
    (dbid, area_code, name_, request_name, type_, city, district, addr,
     street, bd_status, bd_message, bd_res_str, city_bd, district_bd,
     business_bd, cityid_bd, name_bd, uid, lat_bd, lng_bd, compute_res,
     name_ratio_res, combine_ratio_res, uid_href) = row
    city = replace_illeagl_tag(city)
    district = replace_illeagl_tag(district)
    uid = replace_illeagl_tag(uid)
    # Skip rows whose cleaned uid has fewer than 6 characters.
    if len(uid.replace(' ', '')) < 6:
        continue
    # Same id layout as the saved result file names.
    input_ = '%s%s%s' % (city, district, uid)
    if input_ in _requested_ids:
        print('requested', input_)
        continue
    district_entry = request_dic.setdefault(city, {}).setdefault(
        district, {'uid_list': [], 'file_row_list': []})
    if uid not in district_entry['uid_list']:
        district_entry['uid_list'].append(uid)
        # file_row_list is not read anywhere else in this file.
        district_entry['file_row_list'].append(row)
# The sheet rows are no longer needed once they are grouped.
del data_jmtool
# Default output directory of write_res_file(), with a trailing separator
# because the file name is appended directly. os.path.join replaces the
# hard-coded '\\' separators; the result is the same on Windows and also
# valid on other platforms.
write_res_file_dir = os.path.join(curPath, dir_, '')
# A response containing any of these substrings is reported and not saved.
ex_l = ['Proxy Error', 'APP IP校验失败']
def write_res_file(str_, input_, dir_=write_res_file_dir, file_postfix='.txt'):
    """Save a response text to ``dir_ + input_ + file_postfix`` as UTF-8.

    Nothing is written when *str_* contains one of the markers in ``ex_l``;
    the first matching marker is printed instead.

    Args:
        str_: Text to store.
        input_: File name stem.
        dir_: Target directory including its trailing separator. The default
            is the value ``write_res_file_dir`` had when this function was
            defined.
        file_postfix: File name suffix.
    """
    hit = next((marker for marker in ex_l if marker in str_), None)
    if hit is not None:
        print('EXCEPTION-', hit)
        return
    fname = '{}{}{}'.format(dir_, input_, file_postfix)
    # The with-statement closes the file on exit.
    with open(fname, 'w', encoding='utf-8') as out:
        out.write(str_)
    print('ok', threading.get_ident(), input_)
class MyThread(threading.Thread):
    """Thread whose run() calls ``func(args)``.

    *args* is handed to *func* as one single positional argument; it is not
    unpacked.
    """

    def __init__(self, func, args, name):
        super().__init__()
        # Assign the name after base initialisation, through the Thread
        # ``name`` property.
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        """Invoke the stored callable with the stored argument."""
        target, payload = self.func, self.args
        target(payload)
# Counter initialised here; not referenced anywhere else in this file.
requested_type_counter = 0
# Request URL template; fun_() substitutes the literal tokens UID and AK.
base_url = (
    'http://api.map.baidu.com/place/v2/detail'
    '?uid=UID&output=json&scope=2&ak=AK'
)
def fun_(city, timeout=30):
    """Request and store the detail response for every pending uid of *city*.

    For each uid listed under ``request_dic[city]`` that has no saved result
    yet, a key is taken from the database, the URL built from ``base_url`` is
    fetched, the key's usage counter is incremented and the response text is
    saved with write_res_file(). If any of these steps raises, a marker file
    is written to the exception directory instead.

    Processing of the city stops as soon as no usable key is left.

    Args:
        city: Key of ``request_dic`` to process.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the worker thread forever.
    """
    for district in request_dic[city]:
        for uid in request_dic[city][district]['uid_list']:
            input_ = '%s%s%s' % (city, district, uid)
            # Checked before fetching a key, to avoid a pointless database
            # query for an id that is skipped anyway.
            if input_ in requested_file_list:
                continue
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # The original `break` only left the uid loop, so exhaustion
                # was re-queried and re-reported for every remaining district.
                return
            url_ = base_url.replace('UID', uid).replace('AK', ak)
            try:
                bd_res_json_str = requests.get(url_, timeout=timeout).text
                db_update_one_today_used(ak)
                write_res_file(bd_res_json_str, input_)
            except Exception as exc:
                # Best-effort worker: record the failure and move on to the
                # next uid instead of killing the thread.
                bd_res_json_str = '请求百度-异常'
                write_res_file(bd_res_json_str, input_,
                               requested_file_dir_exception_str)
                print(bd_res_json_str, input_, exc)
# Number of cities that have pending requests.
city_num = len(request_dic)
# main() starts a thread only for cities whose 1-based position in the
# sorted city list lies within start_loop..stop_loop.
start_loop = 0
stop_loop = 100
# Not referenced anywhere else in this file.
thread_max = city_num
def main():
    """Run fun_() in one thread per selected city and wait for all of them.

    Cities are taken from ``request_dic`` in ascending sorted order; only
    those whose 1-based position lies within ``start_loop``..``stop_loop``
    get a thread.
    """
    threads_list = []
    for nloop, city in enumerate(sorted(request_dic), start=1):
        if nloop < start_loop or nloop > stop_loop:
            continue
        # MyThread passes its args value to the callable as one argument, so
        # the city name is passed as-is.
        threads_list.append(MyThread(fun_, city, fun_.__name__))
    for t in threads_list:
        # The original `t.setDaemon = False` replaced the method with False
        # and never set the flag; assign the attribute instead.
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


if __name__ == '__main__':
    main()


# Original article: http://www.cnblogs.com/yuanjiangw/p/7470051.html