"""Selenium bot that logs into weibo.com and reposts queued text.

Rows are read from the ``qqzoneshuoshuo`` MySQL table (``lefttimes_weibo > 0``).
For each row the script opens a link found on the account's home page, clicks
one of the entries matched by ``.WB_row_line>li:nth-child(3)``, fills the
second ``<textarea>`` on the page with the row text plus an ad link, submits
it, and decrements the row's ``lefttimes_weibo`` counter.

The module runs the whole workflow at import time and never returns.
"""
import os
import random
import sys
import threading
import time

import pymysql
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# MySQL connection settings (host, port, user, password, database).
# NOTE(review): credentials are hard-coded in the source; consider moving
# them to configuration.
h, pt, u, p, db = 'localhost', 3306, 'root', '', 'qqzone'


def _open_connection():
    """Open a connection with the module-level settings; return None on failure."""
    try:
        return pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8mb4')
    except Exception as e:
        print(e)
        return None


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run *sql* and return every fetched row.

    Args:
        sql: Statement to execute.
        res_type: ``'dic'`` returns rows as dicts (``DictCursor``); any other
            value returns rows as tuples.
        args: Optional parameters bound by the driver (``%s`` placeholders).

    Returns:
        The result of ``cursor.fetchall()``, or ``()`` when the connection
        could not be opened.

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    conn = _open_connection()
    if conn is None:
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            conn.commit()
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()


def mysql_write(sql, args=None):
    """Run *sql* and commit.

    Args:
        sql: Statement to execute.
        args: Optional parameters bound by the driver (``%s`` placeholders).

    Returns:
        ``0`` on success, ``1`` when the connection could not be opened.

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    conn = _open_connection()
    if conn is None:
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    return 0


# Directory (with trailing slash) where downloaded media is stored.
img_dir = 'D:/pyaction/toutiao_team_win/dl_img/'


def spider_webimg_dl_return_local_img_path(img_dir, img_url, media_type='img',
                                           local_default='default.DONOT_REMOVE.png'):
    """Download *img_url* into *img_dir* and return the local file path.

    The function deliberately sleeps between steps (3 s after the request,
    30 s before naming the file, 210 s before writing an mp4).

    Args:
        img_dir: Target directory, including the trailing separator.
        img_url: URL of the media to download.
        media_type: ``'img'`` or ``'mp4'``; any other value downloads nothing.
        local_default: File name inside *img_dir* used as the fallback image.

    Returns:
        For ``'img'``: the path of the written ``.png`` file, or the default
        image path when the download or the write fails.
        For ``'mp4'``: the path of the written ``.mp4`` file, or ``''`` on
        failure.
        For any other *media_type*: the default image path.
    """
    fallback = '%s%s' % (img_dir, local_default)

    if media_type == 'img':
        try:
            req = requests.get(img_url)
            time.sleep(3)
            if req.status_code != 200:
                print('-!=200')
                return fallback
            time.sleep(30)
            print(img_url)
            data = req.content
            # Timestamp + thread id + random number: digits only, so the name
            # needs no character filtering.
            target = '%s%s%s%s%s' % (
                img_dir, time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())),
                str(threading.get_ident()), str(random.randrange(1000, 9999)), '.png')
            print(target)
            if not data:
                return fallback
            with open(target, 'wb') as f:
                f.write(data)
            return target
        except Exception as e:
            # Never hand back a path that was not actually written.
            print(e)
            return fallback

    if media_type == 'mp4':
        try:
            time.sleep(30)
            print(img_url)
            target = '%s%s%s%s%s' % (
                img_dir, time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())),
                str(threading.get_ident()),
                img_url.split('.mp4?')[0].split('/')[-1].replace('*', '_'), '.mp4')
            print(target)
            req = requests.get(img_url)
            time.sleep(3)
            if req.status_code != 200:
                print('-!=200')
                return ''
            data = req.content
            time.sleep(210)
            if not data:
                return ''
            with open(target, 'wb') as f:
                f.write(data)
            return target
        except Exception as e:
            print(e)
            return ''

    return fallback


# The time-of-day filter was disabled in the original script (`if 1 > 13`);
# it stays off unless this flag is flipped.
ENABLE_TIME_FILTER = False

# Seconds to wait before polling the database again when nothing is queued.
EMPTY_QUEUE_WAIT_SECONDS = 60

# Taobao item ids rotated into the ad link. The original comment labelled
# them: hydrating spray, bracelet, facial mask, sunscreen spray (the fifth id
# had no label).
AD_ITEM_IDS = ['567557180229', '565875313425', '545159271159', '546048319163', '567693004121']

hot_topic_list_url = 'https://weibo.com/u/1779073702/home'

FETCH_SQL = ('SELECT id, words,imgurls,time_site FROM qqzoneshuoshuo WHERE lefttimes_weibo>0 '
             'AND INSTR(imgurls,".mp4")=0 AND id IN ( SELECT MAX(id) FROM qqzoneshuoshuo '
             'GROUP BY id_site) ORDER BY time_script DESC,id ASC ;')


def _should_skip_by_time(content, time_site):
    """Return True when the (currently disabled) time-of-day filter rejects a row."""
    # '天' (day) / '月' (month) in the site timestamp -> skipped.
    # NOTE(review): presumably these mark posts that are not from today; confirm.
    if '天' in time_site or '月' in time_site:
        return True
    lh = int(time.strftime("%H", time.localtime()))
    if lh - int(time_site.split(':')[0]) >= 24:
        return True
    # "Good morning" text from 11:00 on, "good night" text up to 20:59.
    if '早安' in content and lh >= 11:
        return True
    return '晚安' in content and lh <= 20


def _clean_content(content):
    """Trim page chrome from the stored text and neutralise quotes/newlines."""
    content = content.split('展开全文')[0].split('上传')[0].split('浏览')[0]
    content = content.replace('"', ' ').replace("'", ' ').replace('\n', ' ')
    for noise in ('密龄素材空间', '评论'):
        content = content.replace(noise, ' ')
    return content


def _pick_ad_url():
    """Return the Taobao item URL selected by the current epoch second."""
    ad_this = AD_ITEM_IDS[int(time.time()) % len(AD_ITEM_IDS)]
    return 'https://item.taobao.com/item.htm?id={}'.format(ad_this)


def _open_random_home_link(driver):
    """Open the home page, then one of its ``li>p>a`` links; False if none exist."""
    driver.execute_script('window.location.href="{}"'.format(hot_topic_list_url))
    time.sleep(20)
    # find_elements(By...) works on Selenium 3 and 4; the find_elements_by_*
    # helpers were removed in Selenium 4.3.
    links = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, 'li>p>a')]
    links = [link for link in links if link]
    if not links:
        print('no link matched li>p>a')
        return False
    count = len(links)
    # Same weighting as before (time-based index, first, second) but clamped
    # so a single-link page cannot raise IndexError.
    index = random.choice([int(time.time()) % count, 0, min(1, count - 1)])
    # Pass the URL as an argument instead of splicing it into the script.
    driver.execute_script('window.location.href=arguments[0]', links[index])
    time.sleep(10)
    driver.refresh()
    time.sleep(random.randrange(3, 6))
    time.sleep(15)
    return True


def _click_action_entry(driver):
    """Click one of the first few matched entries, scrolling between attempts."""
    entries = driver.find_elements(By.CSS_SELECTOR, '.WB_row_line>li:nth-child(3)>a>span>span>span')
    # Weighted towards the first entries; only indexes that exist are kept.
    choices = [k for k in (0, 0, 1, 1, 1, 2, 2, 3) if k < len(entries)]
    if not choices:
        return False
    for step in range(20):
        time.sleep(1)
        driver.execute_script('window.scrollTo(0,{})'.format(step * 50))
        time.sleep(2)
        try:
            # The click can fail while another element obscures the target;
            # scroll a little further and retry.
            entries[random.choice(choices)].click()
            return True
        except Exception as e:
            print(e)
    return False


def _submit_repost(driver, text):
    """Fill the second textarea with *text* and submit; False if filling failed."""
    try:
        # A real keyboard event is needed so the page reacts to the field.
        driver.find_elements(By.TAG_NAME, 'textarea')[1].send_keys(Keys.SPACE)
        time.sleep(2)
        driver.find_elements(By.TAG_NAME, 'textarea')[1].send_keys(Keys.BACK_SPACE)
        # The text is passed as a script argument so backslashes or line
        # breaks in it cannot break the JavaScript.
        driver.execute_script(
            'document.getElementsByTagName("textarea")[1].value=arguments[0]', text)
        time.sleep(2)
    except Exception as e:
        print(e)
        return False
    driver.execute_script("document.getElementsByName('forward')[0].click();")
    time.sleep(2)
    driver.execute_script("document.getElementsByClassName('btn W_fr')[0].childNodes[0].click()")
    time.sleep(2)
    driver.refresh()
    return True


driver = webdriver.Chrome()
myurl = 'https://weibo.com/login.php'
driver.get(myurl)
# Needs spare memory and CPU here so the browser can parse the DOM and cope
# with the JavaScript-heavy page.
time.sleep(10)
driver.refresh()
time.sleep(10)

# NOTE(review): login name and password are hard-coded in the source.
js = ('document.getElementById("loginname").value="p.cn";'
      'document.getElementsByName("password")[0].value="welcome";'
      'document.getElementsByClassName("W_btn_a btn_32px")[0].click();')
try:
    driver.execute_script(js)
    time.sleep(30)
except Exception as e:
    print(e)
    # Close the browser and exit with a non-zero status (the previous
    # os._exit(1024) skipped cleanup and truncates to status 0 on POSIX).
    try:
        driver.quit()
    finally:
        sys.exit(1)

time.sleep(random.randrange(3, 6))
for isc in range(2):
    # Original note: Toutiao has no iframe and scrolls indefinitely; QQ Zone
    # posts live in an iframe with a fixed 20 items and error on the 2nd
    # scroll. Scrolling is memory/CPU sensitive.
    time.sleep(1)
    driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')

mytopic, myname = ' #doaez朵韵诗磁石娃娃燕窝润颜面膜# ', '南京同仁堂密龄白藜芦醇-燕窝美妆-DOAEZ朵韵诗-阿静@ '

while True:
    res_content = mysql_fetch(FETCH_SQL, 'dic')
    print(res_content)
    if not res_content:
        # Nothing queued (or the database is unreachable): wait instead of
        # re-querying in a tight loop.
        time.sleep(EMPTY_QUEUE_WAIT_SECONDS)
        continue

    for row in res_content:
        dbid, content, time_site = row['id'], row['words'] or '', row['time_site']
        if ENABLE_TIME_FILTER and _should_skip_by_time(content, time_site):
            continue
        # NOTE(review): the collapsed source does not show whether this pause
        # sat inside the disabled filter branch; it is kept active here.
        time.sleep(10)

        content = _clean_content(content)
        ad_url = _pick_ad_url()

        if not _open_random_home_link(driver):
            continue
        if not _click_action_entry(driver):
            continue
        time.sleep(12)

        mystr = '{}{}{}{}'.format(mytopic, myname, content, ad_url)
        if not _submit_repost(driver, mystr):
            continue

        # Image upload is disabled in this version, so
        # spider_webimg_dl_return_local_img_path is not called from here.
        sql = 'UPDATE qqzoneshuoshuo SET lefttimes_weibo=lefttimes_weibo-1 WHERE id=%s'
        print(sql, dbid)
        try:
            if mysql_write(sql, (dbid,)) != 0:
                print('could not decrement lefttimes_weibo for id', dbid)
        except Exception as e:
            # Keep the bot running, but do not hide the failure.
            print(e)
        driver.refresh()
        time.sleep(random.randint(30, 60))

    # Keep the page alive for roughly 15 minutes, refreshing about once a
    # minute, before polling the database again.
    # NOTE(review): the collapsed source does not show whether this loop ran
    # once per batch or once per row; it is placed per batch here - confirm.
    for si in range(15):
        try:
            driver.refresh()
            time.sleep(60)
            time.sleep(random.randint(0, 10))
            print(si)
        except Exception as e:
            print(145, e)