
Web-scraping example: using Selenium to simulate clicks on a dynamic page

Date: 2021-04-21 10:14:50

This example scrapes Douyu's live-stream directory for each streamer's name, category, stream title, and popularity, and writes the records to a local file in JSON Lines format (one JSON object per line). The code is as follows:

# coding:utf-8
import json
import unittest

from bs4 import BeautifulSoup
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class DouyuSpider(unittest.TestCase):
    def setUp(self):
        options = ChromeOptions()
        options.add_argument('--headless')
        options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
        # Hide the usual automation fingerprints so the site is less likely to block us
        options.add_experimental_option('useAutomationExtension', False)
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        options.add_argument('disable-blink-features=AutomationControlled')
        # Block notification pop-ups
        prefs = {'profile.default_content_setting_values': {'notifications': 2}}
        options.add_experimental_option('prefs', prefs)
        self.driver = Chrome(options=options)
        self.url = 'https://www.douyu.com/directory/all'
        self.f = open('douyu.json', 'a', encoding='utf-8')

    def testDouyu(self):
        self.driver.get(self.url)
        while True:
            # Wait until the "next page" button is present in the DOM
            next_btn = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'dy-Pagination-item-custom')))
            soup = BeautifulSoup(self.driver.page_source, 'lxml')
            elements = soup.select(
                'div[class="layout-Module-container layout-Cover ListContent"] li[class="layout-Cover-item"]')
            for element in elements:
                user = element.find_all('div', {'class': 'DyListCover-userName'})[0].text   # streamer name
                zone = element.find_all('span', {'class': 'DyListCover-zone'})[0].text      # category
                title = element.find_all('h3', {'class': 'DyListCover-intro'})[0].text      # stream title
                hot = element.find_all('span', {'class': 'DyListCover-hot'})[0].text        # popularity
                douyu = {
                    'username': user,
                    'zone': zone,
                    'title': title,
                    'hot': hot
                }
                # One JSON object per line (JSON Lines)
                json_data = json.dumps(douyu, ensure_ascii=False)
                self.f.write(json_data + '\n')
            # Stop when the "next" button is disabled, i.e. we are on the last page
            if self.driver.page_source.find('dy-Pagination-disabled dy-Pagination-next') != -1:
                break
            next_btn.click()

    def tearDown(self):
        self.driver.quit()
        self.f.close()
        print('Done')

if __name__ == '__main__':
    unittest.main()
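Because the spider appends one JSON object per line, the resulting file can be consumed record by record without loading it all as a single JSON document. A minimal sketch of reading such a JSON Lines file back, assuming a `douyu.json` produced by a run of the spider above (the `load_jsonlines` helper name is my own, not from the original post):

```python
import json

def load_jsonlines(path):
    """Read a JSON Lines file into a list of dicts, one record per line."""
    records = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # skip any blank lines
                records.append(json.loads(line))
    return records
```

For example, `load_jsonlines('douyu.json')` returns a list of dicts with the keys `username`, `zone`, `title`, and `hot`, ready for filtering or loading into a DataFrame.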


Original: https://www.cnblogs.com/eliwang/p/14683624.html
