首页 > 其他 > 详细

selenium

时间:2020-05-08 00:14:33      阅读:90      评论:0      收藏:0      [点我收藏+]
- selenium模块在爬虫中的使用
    - 概念:是一个基于浏览器自动化的模块。
    - 与爬虫之间的关联:
        - 便捷的捕获到动态加载到的数据。(可见即可得)
        - 实现模拟登陆
    - 环境安装:pip install selenium
    - 基本使用:
        - 准备好某一款浏览器的驱动程序:http://chromedriver.storage.googleapis.com/index.html
            - 版本的映射关系:https://blog.csdn.net/huilan_same/article/details/51896672
        - 实例化某一款浏览器对象
    - 动作链:
        - 一系列连续的动作
        - 在实现标签定位时,如果发现要定位的标签存在于iframe标签之中,则在定位之前必须先执行一个固定的操作:bro.switch_to.frame(id)
    - 无头浏览器的操作:无可视化界面的浏览器
        - PhantomJS:已停止更新
        - 谷歌无头浏览器
    - 让selenium规避检测

自动化京东搜索关键字

from selenium import webdriver
from time import sleep
# Automate a keyword search on JD: open the home page, type a query, submit
# it, scroll to the bottom and dump the resulting page source.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.jd.com/')
sleep(1)
# Locate the search box and type the keyword.
search_input = bro.find_element_by_id('key')
search_input.send_keys('mac pro')

# Locate the search button and submit the query.
btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
btn.click()
sleep(2)

# Execute JavaScript in the page to scroll down to the bottom.
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)

page_text = bro.page_source
print(page_text)

sleep(2)
bro.quit()

自动化抓取动态加载数据

from selenium import webdriver
from time import sleep
from lxml import etree
# Collect the rendered HTML of the first page plus the next three pages,
# then parse every captured page with lxml and print each entry.
bro = webdriver.Chrome(executable_path='chromedriver.exe')

bro.get('http://125.35.6.84:81/xk/')
sleep(1)
page_text = bro.page_source
page_text_list = [page_text]

for i in range(3):
    bro.find_element_by_id('pageIto_next').click()  # click "next page"
    sleep(1)  # pause before reading the page source of the next page
    page_text_list.append(bro.page_source)

for page_text in page_text_list:
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@id="gzlist"]/li')
    for li in li_list:
        # Both values are read from the `title` attribute of child tags.
        title = li.xpath('./dl/@title')[0]
        num = li.xpath('./ol/@title')[0]
        print(title+':'+num)

sleep(2)
bro.quit()

动作链

from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains
# Drag the "draggable" box on the runoob demo page using an action chain.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
# The target element lives inside an iframe, so switch into it before locating.
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_id('draggable')
# Dragging = click-and-hold + move.
action = ActionChains(bro)
action.click_and_hold(div_tag)

for i in range(5):
    # perform() makes the queued actions execute immediately.
    action.move_by_offset(17,5).perform()
    sleep(0.5)

# release() only queues the action; without perform() the mouse button is
# never actually released.
action.release().perform()

sleep(3)

bro.quit()

12306登录

超级鹰:

import requests
from hashlib import md5

class Chaojiying_Client(object):
    """Minimal HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """
        Store the account data sent with every request.

        username: account name
        password: plain-text password; only its MD5 hex digest is kept
        soft_id:  software id sent as the ``softid`` form field
        """
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        # Form fields included in every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        Upload an image for recognition and return the decoded JSON reply.

        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        Report a wrongly recognised image and return the decoded JSON reply.

        im_id: id of the image whose result was wrong
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

12306自动登录主体代码:

from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from Cjy import Chaojiying_Client
from selenium.webdriver import ActionChains
# Open the 12306 login page, screenshot the whole window and crop the
# captcha image out of the screenshot.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(5)
bro.save_screenshot('main.png')

code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_tag.location
size = code_img_tag.size
# Crop box as (left, upper, right, lower), built from the element's
# position and size.
rangle = (int(location['x']),int(location['y']),int(location['x']+size['width']),int(location['y']+size['height']))

i = Image.open('./main.png')
frame = i.crop(rangle)
frame.save('code.png')

def get_text(imgPath,imgType):
    """
    Send an image file to Chaojiying and return the recognised text.

    imgPath: path of the image file to upload
    imgType: captcha type code passed through to PostPic
    Returns the ``pic_str`` field of the service's JSON reply.
    """
    # NOTE(review): account credentials are hard-coded here; consider loading
    # them from configuration instead of source code.
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    # Use a context manager so the file handle is closed after reading.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']

# The recognition result is parsed as "x1,y1|x2,y2|...":
# "55,70|267,133" becomes [[55, 70], [267, 133]].
result = get_text('./code.png',9004)
all_list = []
# split('|') also yields a one-item list when there is only a single point,
# so one loop covers both the single- and multi-point cases.
for point in result.split('|'):
    coords = point.split(',')
    all_list.append([int(coords[0]), int(coords[1])])
print(all_list)

# Click every returned point, using offsets relative to the captcha image.
for a in all_list:
    x = a[0]
    y = a[1]
    # A fresh chain per click, so earlier clicks are not queued again.
    ActionChains(bro).move_to_element_with_offset(code_img_tag,x,y).click().perform()
    sleep(1)

bro.find_element_by_id('username').send_keys('123456')
sleep(1)
bro.find_element_by_id('password').send_keys('67890000000')
sleep(1)
bro.find_element_by_id('loginSub').click()

sleep(5)
bro.quit()

selenium其他操作

#使用谷歌无头浏览器
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.options import Options

# Run Chrome without a visible window and print the page source.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# `options=` is the keyword the other Chrome example in this file uses;
# `chrome_options=` is its deprecated alias.
driver = webdriver.Chrome(r'chromedriver.exe',options=chrome_options)
driver.get('https://www.cnblogs.com/')
print(driver.page_source)
# A headless browser has no window to close by hand, so shut it down explicitly.
driver.quit()

#如何规避selenium被检测
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from time import sleep

# Start Chrome with the "enable-automation" switch excluded, which the
# article presents as a way to keep Selenium from being detected.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(r'chromedriver.exe',options=option)
driver.get('https://www.taobao.com/')

 

selenium

原文:https://www.cnblogs.com/sun-10387834/p/12846802.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!