1: selenium 的安装与介绍
selenium模块讲解
一 什么是selenium?
selenium 最初是一个自动化测试工具,可以用它驱动浏览器
自动执行预先定义好的操作,例如在页面中执行 JS 代码、
跳过登录验证。因此也可以使用 selenium 帮我们实现爬虫。
二 为什么要使用selenium?
1、优点:
使用requests模块登录需要分析大量的复杂通信流程,使用selenium
可以轻松跳过登录验证。
2、缺点:
浏览器会加载css、js、图片、视频...数据,爬虫效率相比requests模块要低。
三 如何使用selenium?
下载selenium模块:
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
下载浏览器驱动:
http://npm.taobao.org/mirrors/chromedriver/2.38/
'''
驱动下载后,放进 Python 安装目录下的 Scripts 目录,然后把该目录添加到环境变量 PATH 中:
Win7 系统是在 PATH 末尾追加分号再加 Scripts 的路径;Win10 直接新增一条即可。
'''
# selenium: first steps
#
# Opens Chrome through chromedriver, searches jd.com for a book title and
# always shuts the browser down afterwards.
import time

from selenium import webdriver  # drives the browser
# Builds an action chain; used e.g. to drag the image of a sliding captcha.
from selenium.webdriver import ActionChains
# Locator strategies: By.ID, By.CSS_SELECTOR, By.CLASS_NAME, ...
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # keyboard key constants
# Used together with WebDriverWait below; EC is an alias of expected_conditions.
from selenium.webdriver.support import expected_conditions as EC
# Waits until a given element of the page has been loaded.
from selenium.webdriver.support.wait import WebDriverWait

# Open Chrome through the Chrome driver.
# The absolute path of chromedriver.exe may be passed explicitly:
# chrome = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
# With no argument, chromedriver.exe is expected in the interpreter's
# Scripts folder (i.e. somewhere on PATH).
# `chrome` is the driver object.
chrome = webdriver.Chrome()

# ---------------------------------------------------------------------------
# Example 1 (kept for reference, disabled): search Baidu and press Enter.
# ---------------------------------------------------------------------------
# try:
#     # Send a GET request to tank's blog home page.
#     # chrome.get('https://www.cnblogs.com/kermitjam/')
#
#     # Argument 1: driver object.  Argument 2: timeout in seconds.
#     wait = WebDriverWait(chrome, 10)
#
#     # 1. Open Baidu.
#     chrome.get('https://www.baidu.com/')
#
#     # 2. Locate the search input box.
#     input_tag = wait.until(
#         # presence_of_element_located() takes one tuple:
#         #   item 1: the locator strategy
#         #   item 2: the value to look for
#         EC.presence_of_element_located(
#             (By.ID, "kw")
#         )
#     )
#     # Same call written on one line:
#     input_tag = wait.until(EC.presence_of_element_located((By.ID, "kw")))
#
#     # 3. Type the search term.
#     input_tag.send_keys('一拳超人')
#
#     # 4. Press the Enter key.
#     input_tag.send_keys(Keys.ENTER)
#
#     time.sleep(3)
#
# # Runs whether or not the try body raised.
# finally:
#     # Close the browser.
#     chrome.close()

# ---------------------------------------------------------------------------
# Example 2: search jd.com and click the search button.
# ---------------------------------------------------------------------------
try:
    # Send a GET request to tank's blog home page.
    # chrome.get('https://www.cnblogs.com/kermitjam/')

    # Argument 1: driver object.  Argument 2: timeout in seconds.
    wait = WebDriverWait(chrome, 10)

    # 1. Open the JD home page.
    chrome.get('https://www.jd.com/')

    # 2. Locate the search input box.
    input_tag = wait.until(EC.presence_of_element_located((By.ID, "key")))

    # 3. Type the search term.
    input_tag.send_keys('唐诗三百首')

    # 4. Locate the search button by its class attribute.
    search_button = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'button')))

    # 5. Click the search button.
    search_button.click()

    # Leave the result page visible for a moment.
    time.sleep(3)

# Runs whether or not the try body raised.
finally:
    # quit() ends the whole WebDriver session (all windows plus the driver
    # process); close() would only close the current window.
    chrome.quit()
requests的post请求
"""Log in to GitHub with a POST request (requests + re)."""
import os
import re

import requests

# Credentials are read from the environment so that no real account data
# lives in the source file.
github_login = os.environ.get('GITHUB_LOGIN')
github_password = os.environ.get('GITHUB_PASSWORD')
if not github_login or not github_password:
    raise SystemExit(
        'Set the GITHUB_LOGIN and GITHUB_PASSWORD environment variables '
        'before running this script.'
    )

# Seconds to wait for GitHub before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# ---------------------------------------------------------------------------
# Step 1: fetch the login page to obtain the token.
#
#   Request URL:     https://github.com/login
#   Request method:  GET
#   Response header: Set-Cookie
#   Request headers: Cookie, User-Agent
# ---------------------------------------------------------------------------
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

response = requests.get(url='https://github.com/login', headers=headers,
                        timeout=REQUEST_TIMEOUT)
# print(response.text)

# Convert the cookies returned by the login page into a plain dict.
login_cookies = response.cookies.get_dict()

# The hidden form field carries the CSRF token that must be posted back.
token_matches = re.findall(
    '<input type="hidden" name="authenticity_token" value="(.*?)" />',
    response.text,
    re.S,
)
if not token_matches:
    # Fail with a clear message instead of a bare IndexError.
    raise RuntimeError('authenticity_token was not found in the login page')
authenticity_token = token_matches[0]

print(authenticity_token)

# ---------------------------------------------------------------------------
# Step 2: send the POST request to the session URL.
#
#   Request URL:     https://github.com/session
#   Request method:  POST
#   Request headers:
#       Referer: https://github.com/login   (where the previous request came from)
#       Cookie: ...
#       User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
#   Request body (only POST requests carry a body):
#       commit: Sign in
#       utf8: ?
#       authenticity_token: VX79esFc0YPdR1UFzUM/6MTRZOlYQ0btF5k2/x7uZea0x2E6W4bmRpwHsaCBN+096PaWNkcQjJOsyUzUqsAhIw==
#           LLWlTr0qLcYC74hn7OI7IlyeB9rZei9737Lqtzz0sKTgY7Js7pUUhZ6bNC6lCkS+OHfVukkbTejjd0BnjPvGUg==
#       login: <your user name>
#       password: *****
#       webauthn-support: unsupported
# ---------------------------------------------------------------------------

# Build the request headers.
headers2 = {
    'Referer': 'https://github.com/login',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
}

# Build the request body.
form_data = {
    "commit": "Sign in",
    # NOTE(review): "?" is presumably a check mark that was lost in an
    # encoding round-trip -- confirm against the real login form.
    "utf8": "?",
    "authenticity_token": authenticity_token,
    "login": github_login,        # your own GitHub user name
    "password": github_password,  # your own GitHub password
    "webauthn-support": "unsupported",
}

# POST to the session address, carrying the request headers, the request
# body and the cookies received from the login page.
response2 = requests.post(url='https://github.com/session', data=form_data,
                          headers=headers2, cookies=login_cookies,
                          timeout=REQUEST_TIMEOUT)
print(response2.status_code)
# print(response2.text)

# Save the returned page so the login result can be inspected in a browser.
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
爬西刺代理的有用ip
import requests

# ===========================================================================
# Proxy settings: the request is sent to the proxy, which forwards it on our
# behalf (getting an IP banned is a common occurrence when crawling).
# ===========================================================================
# import requests
# proxies = {
#     # Proxy with user name and password: they go before the '@' sign.
#     # NOTE: a dict keeps only the last value of a repeated key, so use
#     # just ONE of the two 'http' entries below.
#     'http': 'http://<user>:<password>@localhost:9527',
#     'http': 'http://localhost:9527',
#     'https': 'https://localhost:9527',
# }
# response = requests.get('https://www.12306.cn',
#                         proxies=proxies)
#
# print(response.status_code)

# ===========================================================================
# Crawling the free proxies listed on xicidaili:
#   1. Request the xicidaili free-proxy page.
#   2. Parse it with the re module and extract every proxy.
#   3. Test each crawled proxy against an IP test site.
#   4. If test_ip raises, the proxy is dead; otherwise it is usable.
#   5. Use the working proxy for the real crawl.
#
# Sample table row:
#   <tr class="odd">
#     <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
#     <td>112.85.131.99</td>
#     <td>9999</td>
#     <td> <a href="/2019-05-09/jiangsu">江苏南通</a> </td>
#     <td class="country">高匿</td>
#     <td>HTTPS</td>
#     <td class="country">
#       <div title="0.144秒" class="bar">
#         <div class="bar_inner fast" style="width:88%"> </div>
#       </div>
#     </td>
#     <td class="country">
#       <div title="0.028秒" class="bar">
#         <div class="bar_inner fast" style="width:97%"> </div>
#       </div>
#     </td>
#     <td>6天</td>
#     <td>19-05-16 11:20</td>
#   </tr>
#
# Regex: <tr class="odd">(.*?)</td>.*?<td>(.*?)</td>
# ===========================================================================
# import requests
# import re
# import time
#
# HEADERS = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
# }
#
#
# def get_index(url):
#     time.sleep(1)
#     response = requests.get(url, headers=HEADERS)
#     return response
#
#
# def parse_index(text):
#     ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
#     for ip_port in ip_list:
#         ip = ':'.join(ip_port)
#         yield ip
#
#
# def test_ip(ip):
#     print('测试ip: %s' % ip)
#     try:
#         proxies = {
#             'https': ip
#         }
#
#         # IP test site.
#         ip_url = 'https://www.ipip.net/'
#
#         # Visit the test site through the proxy; a 200 response means the
#         # proxy under test works.
#         response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)
#
#         if response.status_code == 200:
#             print(f'有用的ip:{ip}')
#             return ip
#
#     # A dead proxy makes the request raise.
#     except Exception as e:
#         print(e)
#
#
# # Crawl the NBA site through the proxy.
# def spider_nba(good_ip):
#     url = 'https://china.nba.com/'
#
#     proxies = {
#         'https': good_ip
#     }
#
#     response = requests.get(url, headers=HEADERS, proxies=proxies)
#     print(response.status_code)
#     print(response.text)
#
#
# if __name__ == '__main__':
#     base_url = 'https://www.xicidaili.com/nn/{}'
#
#     for line in range(1, 3677):
#         ip_url = base_url.format(line)
#
#         response = get_index(ip_url)
#
#         # Parse the page into the proxies it lists.
#         ip_list = parse_index(response.text)
#
#         # Walk through every proxy.
#         for ip in ip_list:
#             # print(ip)
#
#             # Test the crawled proxy.
#             good_ip = test_ip(ip)
#
#             if good_ip:
#                 # Working proxy: start the real crawl.
#                 spider_nba(good_ip)

# ===========================================================================
# Authentication
# ===========================================================================
# Tested against the GitHub API.
url = 'https://api.github.com/user'

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Test 1: without credentials the request fails with 401.
# response = requests.get(url, headers=HEADERS)
# print(response.status_code)  # 401
# print(response.text)
#
# Printed result:
# {
#   "message": "Requires authentication",
#   "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
# }
#
# NOTE: credentials are shown as placeholders on purpose -- never commit a
# real password.  GitHub's REST API no longer accepts an account password
# for basic auth; pass a personal access token as the password instead.
#
# Test 2: authenticate with HTTPBasicAuth from requests.auth; on success the
# user information is returned.
# from requests.auth import HTTPBasicAuth
# response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('<username>', '<password-or-token>'))
# print(response.text)
#
# Test 3: a plain tuple passed as `auth` is treated as HTTPBasicAuth by
# default; on success the user information is returned.
# response = requests.get(url, headers=HEADERS, auth=('<username>', '<password-or-token>'))
# print(response.text)

# ===========================================================================
# Uploading files
# ===========================================================================
# Upload a text file.
# files1 = {'file': open('user.txt', 'rb')}
# # `files` is the dedicated parameter of a POST request for uploads.
# response = requests.post('http://httpbin.org/post', files=files1)
# print(response.status_code)  # 200
# print(response.text)

# Upload an image file.
# files2 = {'jpg': open('一拳.jpg', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files2)
# print(response.status_code)  # 200
# print(response.text)
#
# Upload a video file.
# files3 = {'movie': open('love_for_GD.mp4', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files3)
# print(response.status_code)  # 200
# print(response.text)
selenium的选择器的使用
# ---------------------------------------------------------------------------
# Implicit wait (kept for reference, disabled)
# ---------------------------------------------------------------------------
# from selenium import webdriver  # drives the browser
# from selenium.webdriver.common.by import By
# import time
#
# # Get the driver object.
# driver = webdriver.Chrome()
#
# try:
#     # Explicit wait: wait for one specific element.
#     # Argument 1: driver object.  Argument 2: timeout in seconds.
#     # wait = WebDriverWait(chrome, 10)
#
#     driver.get('https://china.nba.com/')
#
#     # Implicit wait: every element lookup keeps retrying for up to 10 s.
#     driver.implicitly_wait(10)
#     news_tag = driver.find_element(By.CLASS_NAME, 'nav-news')
#     # The element object.
#     print(news_tag)
#     # The element's tag name.
#     print(news_tag.tag_name)
#
#     time.sleep(10)
#
# finally:
#     driver.close()

import time

from selenium import webdriver  # drives the browser
from selenium.webdriver.common.by import By  # locator strategies

# =============== All locator strategies ===============
# find_element  returns the first matching element,
# find_elements returns every matching element.
#
# The find_element_by_* helpers were removed from Selenium 4; the same
# lookups are written as find_element(By.<STRATEGY>, value):
#   1. By.LINK_TEXT          (was find_element_by_link_text)  link text
#   2. By.ID                 (was find_element_by_id)         id attribute
#   3. By.CLASS_NAME         (was find_element_by_class_name)
#   4. By.PARTIAL_LINK_TEXT  (was find_element_by_partial_link_text)
#   5. By.NAME               (was find_element_by_name)
#   6. By.CSS_SELECTOR       (was find_element_by_css_selector)
#   7. By.TAG_NAME           (was find_element_by_tag_name)

# Get the driver object.
driver = webdriver.Chrome()

try:
    # Open Baidu.
    driver.get('https://www.baidu.com/')

    # Implicit wait: element lookups below retry for up to 10 seconds.
    driver.implicitly_wait(10)

    # 1. By.LINK_TEXT: find an <a> tag by its full link text.
    # send_tag = driver.find_element(By.LINK_TEXT, '登录')
    # send_tag.click()

    # 2. By.PARTIAL_LINK_TEXT: find an <a> tag by part of its text.
    login_button = driver.find_element(By.PARTIAL_LINK_TEXT, '登')
    login_button.click()
    time.sleep(1)

    # 3. By.CLASS_NAME: find by class attribute.
    login_tag = driver.find_element(By.CLASS_NAME, 'tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 4. By.NAME: find by name attribute.
    username = driver.find_element(By.NAME, 'userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 5. By.ID: find by id attribute.
    password = driver.find_element(By.ID, 'TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 6. By.CSS_SELECTOR: find with a CSS selector.
    # Locate the login button through its id.
    login_submit = driver.find_element(By.CSS_SELECTOR, '#TANGRAM__PSP_10__submit')
    # driver.find_element(By.CSS_SELECTOR, '.pass-button-submit')
    login_submit.click()

    # 7. By.TAG_NAME: find by tag name.
    div = driver.find_element(By.TAG_NAME, 'div')
    print(div.tag_name)

    time.sleep(10)

finally:
    # quit() ends the whole WebDriver session (all windows plus the driver
    # process); close() would only close the current window.
    driver.quit()
原文:https://www.cnblogs.com/xiaohuangxiong/p/11049713.html