"""Scrape company announcements from data.eastmoney.com with Selenium.

The script opens the notices page in Chrome, parses the ``#dt_1`` table on
the first ``PAGE_COUNT`` pages with BeautifulSoup, and writes the collected
rows to ``股票数据.csv`` in the current working directory.
"""
import csv
import time

import bs4
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

NOTICES_URL = 'http://data.eastmoney.com/notices/'
PAGE_COUNT = 6  # pages 1..PAGE_COUNT are scraped

# ------------------------- information extraction -------------------------
# Rows accumulated across all scraped pages; each entry is
# [code, name, title, title_type, date].
info = []


def get_info(html):
    """Parse one page of the notices table and append its rows to ``info``.

    Args:
        html: Full page source containing a ``<table id="dt_1">`` element.

    Raises:
        AttributeError: If the table, its ``<tbody>``, or an expected
            ``<a>``/``<span>`` inside a cell is missing.
        IndexError: If a row has fewer than six ``<td>`` cells.
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find(name='table', attrs={'id': 'dt_1'})
    for tr in table.find('tbody').children:
        # ``.children`` also yields text nodes; only element nodes are rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr.find_all('td')
        code = tds[0].a.string
        name = tds[1].a.string
        title = tds[3].a.string
        title_type = tds[4].span.string
        # Named ``notice_date`` so the ``time`` module is not shadowed.
        notice_date = tds[5].span.string
        info.append([code, name, title, title_type, notice_date])


# ------------------------------ page turning ------------------------------
def next_page(page_number, max_retries=3):
    """Navigate the notices table to ``page_number`` using the pager input.

    Types the page number into ``#PageContgopage``, clicks the pager's
    ``a.btn_link`` and waits until ``#PageCont > span.at`` contains the
    number (presumably the highlighted current-page marker).

    Args:
        page_number: Page number to type into the pager input.
        max_retries: How many additional attempts to make after a timeout.

    Raises:
        TimeoutException: If every attempt times out. The original retried
            by unbounded recursion, which ended in ``RecursionError``.
    """
    wait = WebDriverWait(browser, 20)
    for attempt in range(max_retries + 1):
        try:
            inputs = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#PageContgopage')))
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#PageCont > a.btn_link')))
            inputs.clear()
            inputs.send_keys(str(page_number))
            submit.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#PageCont > span.at'), str(page_number)))
            return
        except TimeoutException:
            if attempt == max_retries:
                raise


# -------------------------------- save data -------------------------------
def save_data(data):
    """Write ``data`` to ``股票数据.csv`` and echo every row to stdout.

    Args:
        data: Iterable of rows in the order code, name, announcement title,
            announcement type, announcement date (matching the header row).
    """
    with open('股票数据.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['代码', '名称', '公告标题', '公告类型', '公告日期'])
        for row in data:
            print(row)
            writer.writerow(row)


browser = webdriver.Chrome()
try:
    browser.get(NOTICES_URL)
    wait = WebDriverWait(browser, 10)
    # Block until the notices table exists before reading the page source.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#dt_1')))

    for page in range(1, PAGE_COUNT + 1):
        get_info(browser.page_source)
        if page < PAGE_COUNT:
            # Only turn the page when another page will actually be scraped.
            next_page(page + 1)
            # Fixed pause kept from the original before the next read.
            time.sleep(2)

    save_data(info)
finally:
    # quit() ends the whole driver session, not just the window, and runs
    # even when scraping fails.
    browser.quit()
# selenium.webdriver 模拟自动化抓取网页数据
# 原文:https://www.cnblogs.com/yaoyue68/p/13790615.html