datawhale爬虫task04

时间：2019-08-12 23:18:25 阅读：133 评论：0 收藏：0 [点我收藏+]

# 实战大项目：模拟登录丁香园，并抓取论坛页面所有的人员基本信息与回复帖子内容。
#
# 丁香园论坛：http://www.dxy.cn/bbs/thread/626626#626626 。
# 丁香园用户名：xxxx
# 密码：ABcd1234

from selenium import webdriver
import time
from lxml import etree
class DingxiangyuanLogin():
    """Log in to DXY (Dingxiangyuan) with Selenium and scrape one forum thread.

    ``run()`` drives a Chrome browser through the account login form, opens
    the target thread and extracts, for every reply found on the page, the
    author name, the author rank/info text and the reply body.
    """

    # Desktop Chrome user-agent sent by the automated browser.
    USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/75.0.3770.100 Safari/537.36")
    LOGIN_URL = "https://auth.dxy.cn/accounts/login?"
    DEFAULT_THREAD_URL = "http://www.dxy.cn/bbs/thread/626626#626626"
    # Fixed pause (seconds) that gives the page time to render or redirect.
    PAUSE_SECONDS = 3

    def __init__(self, username='xxxx', password='ABcd1234', thread_url=None):
        """Store the credentials and the thread to scrape.

        Args:
            username: DXY account name typed into the login form.
            password: DXY account password typed into the login form.
            thread_url: Forum thread to scrape; defaults to
                ``DEFAULT_THREAD_URL`` when omitted.
        """
        self.username = username
        self.password = password
        self.thread_url = thread_url or self.DEFAULT_THREAD_URL

    def run(self):
        """Log in, open the thread and return the scraped replies.

        Returns:
            A list of dicts with the keys ``auth_name``, ``auth_rank`` and
            ``reply_content``, one per reply found on the page.
        """
        # 1. Configure the browser with an explicit user-agent header.
        options = webdriver.ChromeOptions()
        options.add_argument('user-agent=' + self.USER_AGENT)
        # 2. Start the browser driver.
        chrome_driver = webdriver.Chrome(options=options)
        try:
            # 3. Log in to the account.
            self._login(chrome_driver)
            # 4. Open the target thread and parse its HTML source.
            chrome_driver.get(self.thread_url)
            return self._parse_replies(chrome_driver.page_source)
        finally:
            # Always close the browser, even when a step above fails.
            chrome_driver.quit()

    def _login(self, chrome_driver):
        """Fill in and submit the account/password login form."""
        # Imported here so the block does not depend on a file-level import.
        # find_element(By...) works on Selenium 3 and 4; the find_element_by_*
        # helpers were removed in Selenium 4.3.
        from selenium.webdriver.common.by import By

        chrome_driver.get(url=self.LOGIN_URL)
        time.sleep(self.PAUSE_SECONDS)
        # The second tab of the login widget switches to account/password login.
        tabs = chrome_driver.find_element(By.CLASS_NAME, 'login__tab_wp')
        pc_login_tab = tabs.find_elements(By.TAG_NAME, 'a')[1]
        pc_login_tab.click()
        time.sleep(self.PAUSE_SECONDS)
        # Type the credentials and submit the form.
        chrome_driver.find_element(By.NAME, 'username').send_keys(self.username)
        chrome_driver.find_element(By.NAME, 'password').send_keys(self.password)
        chrome_driver.find_element(By.CLASS_NAME, 'form__button').click()
        # Give the login request time to finish before navigating away;
        # otherwise the next page load can abort it.
        time.sleep(self.PAUSE_SECONDS)

    @staticmethod
    def _node_text(node, path):
        """Return the stripped text of the first node matching ``path``.

        Returns ``None`` when nothing matches, so callers can tell a missing
        node from an empty one.
        """
        matches = node.xpath(path)
        if not matches:
            return None
        return matches[0].xpath('string(.)').strip()

    @classmethod
    def _parse_replies(cls, page_source):
        """Extract author name, rank and body of every reply in the HTML."""
        reply_list = []
        xpath_data = etree.HTML(page_source)
        # Every reply is expected inside a <div> whose id starts with "post_".
        replies = xpath_data.xpath('//div[starts-with(@id, "post_")]')
        print("replies: " + str(replies))
        for reply in replies:
            print('reply: ' + str(reply))
            # Author name.
            auth_name = cls._node_text(reply, './/div[@class="auth"]')
            # Reply body.
            reply_content = cls._node_text(reply, './/td[@class="postbody"]')
            if auth_name is None or reply_content is None:
                # Not a complete reply block (the id prefix can match other
                # divs); skip it instead of failing on a missing child.
                continue
            # Author rank / info box; empty when the box is absent.
            auth_rank = cls._node_text(reply, './/div[@class="info clearfix"]') or ''
            print("auth_rank: " + str(auth_rank))
            print('reply_content: ' + str(reply_content))
            reply_list.append({
                'auth_name': auth_name,
                'auth_rank': auth_rank,
                'reply_content': reply_content,
            })
        return reply_list




# Script entry point: create the scraper and run it immediately.
DingxiangyuanLogin().run()

datawhale爬虫task04

原文：https://www.cnblogs.com/tommyngx/p/11343195.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)