chromedriver 的版本需要与本机 chrome 浏览器的版本对应
提示权限不足,sudo chmod +x phantomjs
chromedriver --version
phantomjs --version
driver.find_element #返回第一个元素,如果没有报错
driver.find_elements #返回包含元素的列表,如果没有返回空列表
获取文本:element.text
获取属性值:element.get_attribute("href")
driver.switch_to.frame(frame_reference) #参数只有一个,可以是frame的id、name或者定位到的element
url地址不变,验证码不变
请求验证码的地址,获取响应,进行识别
url地址不变,验证码变化
请求验证码,发送登录请求,需要带上同一套cookie,才能够登录成功,对应可以使用requests.Session()来实现
selenium处理验证码
带上selenium的driver中的cookie来请求验证码
import time

from selenium import webdriver
from selenium.webdriver.common.by import By


class DouYu(object):
    """Scrape live-room listings from the Douyu directory page with Selenium.

    The crawler opens the directory, switches to one category, then walks the
    paginated list, appending every room it finds to ``./douyu.txt``.
    """

    def __init__(self):
        # Initialise the start URL and the browser driver instance.
        self.start_url = "https://www.douyu.com/directory/all"
        self.driver = webdriver.Chrome()

    def parse_url(self, url):
        """Load *url* and click the category link whose text is "颜值".

        Raises whatever Selenium raises (``NoSuchElementException``) when the
        link is not present on the page.
        """
        self.driver.get(url)
        # Fixed pause, presumably to let the page finish rendering.
        time.sleep(5)
        self.driver.find_element(By.LINK_TEXT, "颜值").click()
        # Fixed pause, presumably to let the category list reload.
        time.sleep(2)

    def extract_data_list(self):
        """Collect the rooms on the current page and locate the next-page button.

        Returns:
            A ``(content_list, next_url)`` tuple. ``content_list`` is a list of
            dicts with the keys ``title``, ``anchor``, ``watch_num`` and
            ``link``. ``next_url`` is the enabled next-page element, or
            ``None`` when no such element is found (i.e. on the last page).
        """
        li_list = self.driver.find_elements(
            By.XPATH,
            '//div[@class="layout-Module-container layout-Cover ListContent"]/ul/li',
        )

        content_list = []
        for li in li_list:
            item = {
                "title": li.find_element(By.XPATH, ".//h3").text,
                "anchor": li.find_element(By.XPATH, ".//h2").text,
                "watch_num": li.find_element(
                    By.XPATH, './/span[@class="DyListCover-hot"]'
                ).text,
                "link": li.find_element(By.XPATH, "./div/a[1]").get_attribute("href"),
            }
            content_list.append(item)

        # find_elements returns an empty list instead of raising, so a missing
        # or disabled button simply yields None.
        next_buttons = self.driver.find_elements(
            By.XPATH, '//li[@title="下一页"][@aria-disabled="false"]'
        )
        next_url = next_buttons[0] if next_buttons else None
        return content_list, next_url

    def save_data(self, data):
        """Append each item of *data* to ``./douyu.txt``, one ``str(item)`` per line."""
        # Binary append with explicit UTF-8 encoding keeps the output identical
        # regardless of the platform's default encoding or newline translation.
        with open("./douyu.txt", "ab") as f:
            f.writelines((str(v) + "\n").encode("utf-8") for v in data)

    def run(self):
        """Crawl every page of the category, saving the rooms as it goes.

        The browser is closed when the crawl finishes or fails, so the
        instance cannot be reused after ``run()`` returns.
        """
        try:
            self.driver.maximize_window()
            # Fetch the first page.
            self.parse_url(self.start_url)
            while True:
                # Extract the data of the current page, then save it.
                content_list, next_url = self.extract_data_list()
                self.save_data(content_list)
                # Stop once there is no enabled next-page element.
                if next_url is None:
                    break
                print(next_url)
                next_url.click()
                time.sleep(2)
        finally:
            # Always release the browser and the driver process.
            self.driver.quit()


if __name__ == "__main__":
    douyu = DouYu()
    douyu.run()
原文:https://www.cnblogs.com/itelephant/p/11180313.html