bug
"""Baidu Map first-page spider.

Reads place names from an Excel task sheet, opens the Baidu Map search page
for every name whose HTML has not been saved yet, and writes the rendered
page to ``baidu_map_html_firstpage_pc_not_shop``.  A run is limited to
``MAX_TIME`` seconds; when it ends the script sets its own row in the SQLite
status database to ``pystatus = 2``.
"""
import math
import os
import random
import sqlite3
import sys
import threading
import time
import urllib.parse
from time import sleep

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# Time budget of one run in seconds, measured from script start-up.
start_time, MAX_TIME = time.time(), 60

save_dir = 'baidu_map_html_firstpage_pc_not_shop'
filepath = os.path.join(curPath, save_dir)
# Same directory with a trailing separator (kept for callers that concatenate).
write_res_html_dir = os.path.join(filepath, '')

# A name is cut at the first occurrence of any of these markers.
# NOTE(review): the first two entries are identical in this copy of the
# source; one of them was possibly a full-width parenthesis - confirm.
tag_jmtool_list = ['(', '(', '-']

# Only rows whose type / city is listed here are crawled.  Earlier runs used
# other values (e.g. '住宅小区', '写字楼', '专科医院'; '上海市', '深圳市', '广州市').
target_type_list = ['商场']
target_city_list = ['北京市']

requested_type_counter = 0   # names skipped because their HTML already exists
target_dic = {}              # city -> district -> type -> reduced name -> lists
requested_file_list = set()  # search inputs that already have a saved page
browser = None               # the single shared WebDriver, created in main()

# Set once the time budget is used up and the browser has been shut down.
stop_event = threading.Event()
# A WebDriver session must not be driven from several threads at once.
browser_lock = threading.Lock()


def py_stop_update_db():
    """Set ``pystatus = 2`` for this script in ``py_bdspider_status.db``.

    The row is selected by the script's file name without ``.py``.
    """
    db = os.path.join(curPath, 'py_bdspider_status.db')
    pyname = os.path.basename(__file__).split('.py')[0]
    sql_ = 'UPDATE pystatus_table SET pystatus = 2 WHERE pyname = ?'
    print(sql_, pyname)
    conn = sqlite3.connect(db)
    try:
        # Bound parameter instead of string formatting.
        conn.execute(sql_, (pyname,))
        conn.commit()
    finally:
        conn.close()


def close_browser():
    """Clear cookies and quit the shared browser; safe to call repeatedly."""
    global browser
    if browser is None:
        return
    try:
        # Cookies must be cleared while the session is still alive.
        browser.delete_all_cookies()
        browser.quit()
    except Exception as exc:  # best-effort cleanup at shutdown
        print('last-line', exc)
    finally:
        browser = None


def chk_time():
    """Enforce the ``MAX_TIME`` budget.

    On the first call after the budget is exceeded the status database is
    updated and the browser is closed.

    Returns:
        True when the run must stop (the browser is no longer usable),
        False while there is still time left.
    """
    if stop_event.is_set():
        return True
    if time.time() - start_time <= MAX_TIME:
        return False
    stop_event.set()
    try:
        py_stop_update_db()
    finally:
        close_browser()
    return True


def load_requested_files(directory=filepath):
    """Return the set of search inputs that already have a saved page.

    The input is the file name up to the first ``&`` / ``.html``.
    """
    return {
        entry.split('&')[0].split('.html')[0]
        for entry in os.listdir(directory)
    }


def extract_name(name_):
    """Return ``name_`` truncated at the first marker of ``tag_jmtool_list``."""
    for marker in tag_jmtool_list:
        name_ = name_.split(marker)[0]
    return name_


def build_target_dic(rows, cities=None, types=None):
    """Group task-sheet rows by city, district, type and reduced name.

    Args:
        rows: iterable of 10-column rows laid out as (dbid, area_code,
            ref_area_type_code, city, district, address, city_street, name,
            emp_, emp_1).
        cities: accepted city names; defaults to ``target_city_list``.
        types: accepted type names; defaults to ``target_type_list``.

    Returns:
        ``{city: {district: {type: {reduced_name: {'name_reduction_list':
        [full names], 'history_list': [rows]}}}}}``.

    Raises:
        ValueError: if a row does not have exactly 10 columns.
    """
    cities = target_city_list if cities is None else cities
    types = target_type_list if types is None else types
    result = {}
    for row in rows:
        (_dbid, _area_code, type_, city, district,
         _address, _city_street, name_, _emp, _emp_1) = row
        if city not in cities or type_ not in types:
            continue
        name_reduction = extract_name(name_)
        # Very short reductions are too ambiguous; fall back to the full name.
        if len(name_reduction) < 3:
            name_reduction = name_
        # setdefault at every level, so an existing entry is never reset
        # (the original tested the district level and wiped the lists).
        entry = (
            result.setdefault(city, {})
            .setdefault(district, {})
            .setdefault(type_, {})
            .setdefault(
                name_reduction,
                {'name_reduction_list': [], 'history_list': []},
            )
        )
        entry['name_reduction_list'].append(name_)
        entry['history_list'].append(row)
    return result


def load_target_dic():
    """Read the first sheet of the task workbook and group its rows."""
    # Imported here so the module can be imported without xlrd installed.
    # NOTE(review): xlrd 2.x no longer reads .xlsx files - confirm the
    # installed version.
    import xlrd

    file_name = 'JMTool任务_csv_py_wholeCSV'
    FEXCEL = os.path.join(curPath, '%s%s' % (file_name, '.xlsx'))
    table = xlrd.open_workbook(FEXCEL).sheets()[0]
    return build_target_dic(table.row_values(i) for i in range(table.nrows))


def write_res_html(browser, dir_=write_res_html_dir):
    """Save the browser's current page as ``<dir_>/<search input>.html``.

    The search input is taken from the ``&wd=`` part of the unquoted current
    URL; nothing is written when the URL has no such part.  The original URL
    is prepended to the page source as an HTML comment.
    """
    close_alert(browser)
    raw_url = browser.current_url
    current_url_ = urllib.parse.unquote(raw_url)
    try:
        input_ = current_url_.split('&wd=')[1].split('/?')[0]
    except IndexError:
        print('Exception-', __file__, sys._getframe().f_lineno, current_url_)
        return
    page_source = '%s%s%s%s' % ('<!--', raw_url, '-->', browser.page_source)
    file_name = os.path.join(dir_, '%s%s' % (input_, '.html'))
    # The context manager closes the file (the original only read fo.closed).
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)
    print(os.path.basename(__file__), 'OK-writed-', sys._getframe().f_lineno, '')


def gen_random_letter():
    """Return a random lowercase ASCII letter."""
    return chr(random.randint(97, 122))


def gen_random_num():
    """Return a random integer from 0 to 10, both inclusive."""
    # NOTE(review): 10 is two characters wide when used in gen_sougo_pid;
    # 0-9 may have been intended - confirm.
    return random.randint(0, 10)


def gen_sougo_pid():
    """Return a 16-part random id: letters at positions 1, 3, 4 and 15,
    numbers from gen_random_num() elsewhere."""
    parts = []
    for i in range(1, 17):
        piece = gen_random_letter() if i in (1, 3, 4, 15) else gen_random_num()
        parts.append('%s' % piece)
    return ''.join(parts)


def close_alert(browser, attitude='accept'):
    """Placeholder for alert handling; currently does nothing."""
    return


def create_browser():
    """Start Firefox through the geckodriver.exe next to this script."""
    # Imported here so the module can be imported without selenium installed.
    from selenium import webdriver
    # Not used by the Firefox path; import kept from the original script.
    from selenium.webdriver.chrome.options import Options  # noqa: F401

    executable_path_str = os.path.join(curPath, 'geckodriver.exe')
    return webdriver.Firefox(executable_path=executable_path_str)


def mobile_mobile_pages_html(input_):
    """Open the Baidu Map search for ``input_`` and save the result page.

    Does nothing once the time budget is exhausted.
    """
    # One thread at a time: all threads share the same browser session.
    with browser_lock:
        if chk_time():
            return
        url_ = '%s%s' % ('http://map.baidu.com/?s=s%26wd%3D', input_)
        sleep(2)
        browser.get(url_)
        sleep(2)  # give the page time to render before reading its source
        write_res_html(browser)


class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` with a single argument."""

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.name, self.func, self.args = name, func, args

    def run(self):
        self.func(self.args)


def thread_city_district(city_district):
    """Crawl every not-yet-saved name of one ``'<city>_<district>'`` pair."""
    global requested_type_counter
    city, district = city_district.split('_', 1)
    for names in target_dic[city][district].values():
        for entry in names.values():
            for name_ in entry['name_reduction_list']:
                if stop_event.is_set():
                    return
                input_ = '%s%s%s' % (city, district, name_)
                if input_ in requested_file_list:
                    requested_type_counter += 1
                    print('requested_type_counter=', requested_type_counter, input_)
                    continue
                mobile_mobile_pages_html(input_)


def main():
    """Load the inputs, crawl with one thread per district, then clean up."""
    global browser
    os.makedirs(filepath, exist_ok=True)
    requested_file_list.update(load_requested_files(filepath))
    target_dic.update(load_target_dic())
    browser = create_browser()

    threads_list = [
        MyThread(
            thread_city_district,
            '%s_%s' % (city, district),
            thread_city_district.__name__,
        )
        for city in target_dic
        for district in target_dic[city]
    ]
    for t in threads_list:
        t.daemon = False  # attribute assignment; setDaemon is a method
        t.start()
    for t in threads_list:
        t.join()

    sleep(2)
    py_stop_update_db()
    close_browser()


if __name__ == '__main__':
    main()
#! /usr/bin/env python
# coding=utf-8
"""Periodically launch one of the spider scripts listed in ``py_list``.

A script is picked at random and then run with ``python <py_dir><name>.py``
every ``inc`` seconds through a ``sched`` scheduler.
"""
import os
import random
import sched
import sqlite3
import sys
import time

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# First argument: the time function (seconds elapsed since a fixed point).
# Second argument: the delay function used to wait between events.
schedule = sched.scheduler(time.time, time.sleep)

db = os.path.join(curPath, 'py_bdspider_status.db')
py_list = ['bd1', 'bd2', 'bd3', 'bd4']
py_dir = 'D:\\pymine\\clean\\spider_map\\'
len_ = len(py_list) - 1


def perform_command(cmd, inc):
    """Run shell command ``cmd`` now and queue the same call ``inc`` s later.

    The next run is queued before ``cmd`` starts, so the command repeats
    periodically for as long as the scheduler is running.
    """
    schedule.enter(inc, 0, perform_command, (cmd, inc))
    os.system(cmd)


def pick_script_index():
    """Return a random index into ``py_list``.

    Two nested draws are used, as in the original code; this favours the
    entries at the start of the list.
    """
    upper = random.randint(0, len(py_list) - 1)
    return random.randint(0, upper)


def gen_cmd_python_str(db_path=None):
    """Pick a script, mark it ``pystatus = 1`` in the status DB, return its name.

    If the randomly picked script already has ``pystatus == 1`` its list
    neighbour is taken instead (the previous entry for the last one, the next
    entry otherwise).  The neighbour's own status is not checked.

    Args:
        db_path: SQLite file to use; defaults to the module-level ``db``.

    Returns:
        The chosen name from ``py_list``.
    """
    conn = sqlite3.connect(db if db_path is None else db_path)
    try:
        # Columns are named explicitly so the unpacking does not depend on
        # the table's column order.
        py_db_dic = {
            pyname: pystatus
            for pystatus, pyname in conn.execute(
                'SELECT pystatus, pyname FROM pystatus_table'
            )
        }
        last = len(py_list) - 1
        i = pick_script_index()
        to_requestpy = py_list[i]
        # .get: a script without a status row is treated as not running.
        if py_db_dic.get(to_requestpy) == 1:
            to_requestpy = py_list[i - 1] if i == last else py_list[i + 1]
        sql_ = 'UPDATE pystatus_table SET pystatus = 1 WHERE pyname = ?'
        print(sql_, to_requestpy)
        conn.execute(sql_, (to_requestpy,))
        conn.commit()
    finally:
        conn.close()
    return to_requestpy


def timming_exe(inc=6):
    """Pick one script at random and run it every ``inc`` seconds.

    Blocks until the scheduler queue is empty.
    NOTE(review): the script is picked once, so every later run repeats the
    same command; gen_cmd_python_str() is not used here - confirm intent.
    """
    to_requestpy = py_list[pick_script_index()]
    cmd_str = '%s%s%s%s' % ('python ', py_dir, to_requestpy, '.py')
    schedule.enter(inc, 0, perform_command, (cmd_str, inc))
    # Keep running until the schedule queue becomes empty.
    schedule.run()


if __name__ == '__main__':
    timming_exe(2)
原文:http://www.cnblogs.com/yuanjiangw/p/7429672.html