首页 > 其他 > 详细

datawhale爬虫task04

时间:2019-08-12 23:18:25      阅读:121      评论:0      收藏:0      [点我收藏+]

 

 

# 实战大项目:模拟登录丁香园,并抓取论坛页面所有的人员基本信息与回复帖子内容。
#
# 丁香园论坛:http://www.dxy.cn/bbs/thread/626626#626626 。
# 丁香园用户名:xxxx
# 密码:ABcd1234

from selenium import webdriver
import time
from lxml import etree
class DingxiangyuanLogin():
    """Log in to DXY (Dingxiangyuan) with Selenium and scrape one forum thread.

    ``run`` drives a Chrome session through the account login form, opens the
    target thread and extracts, for every reply on the page, the author name,
    the author's rank/info block and the reply text.
    """

    # User-Agent presented by the automated Chrome session.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    )
    LOGIN_URL = "https://auth.dxy.cn/accounts/login?"
    THREAD_URL = "http://www.dxy.cn/bbs/thread/626626#626626"

    def run(self, username="xxxx", password="ABcd1234"):
        """Log in, load the thread page and return the parsed replies.

        Args:
            username: DXY account name typed into the login form.
            password: DXY password typed into the login form.

        Returns:
            A list of dicts with the keys ``"auth_name"``, ``"auth_rank"`` and
            ``"reply_content"``, one per reply found on the page.
        """
        # 1. Configure the browser: pass the User-Agent as one Chrome switch.
        options = webdriver.ChromeOptions()
        options.add_argument("--user-agent=" + self.USER_AGENT)

        # 2. Create the browser driver.
        chrome_driver = webdriver.Chrome(options=options)
        try:
            # 3. Open the login page and give it time to render.
            chrome_driver.get(url=self.LOGIN_URL)
            time.sleep(3)

            # NOTE(review): the find_element_by_* helpers are the Selenium 3
            # names; Selenium 4.3+ removed them in favour of
            # find_element(By.<KIND>, value). Kept as written in this script.

            # 4. Switch to the "PC login" tab (second link in the tab bar).
            tab_bar = chrome_driver.find_element_by_class_name("login__tab_wp")
            pc_login_tab = tab_bar.find_elements_by_tag_name("a")[1]
            pc_login_tab.click()
            time.sleep(3)

            # 5. Fill in the credentials and submit the form.
            chrome_driver.find_element_by_name("username").send_keys(username)
            chrome_driver.find_element_by_name("password").send_keys(password)
            chrome_driver.find_element_by_class_name("form__button").click()
            # Let the login request finish before navigating away, otherwise
            # the session may not be established yet.
            time.sleep(3)

            # 6. Open the target thread and grab its HTML.
            chrome_driver.get(self.THREAD_URL)
            page_source = chrome_driver.page_source
        finally:
            # Always shut the browser down so no Chrome process is leaked.
            chrome_driver.quit()

        return self._parse_replies(page_source)

    @staticmethod
    def _first_text(node, path):
        """Return the stripped text of the first match of *path*, or ""."""
        matches = node.xpath(path)
        if not matches:
            return ""
        return matches[0].xpath("string(.)").strip()

    def _parse_replies(self, page_source):
        """Extract author name, rank and content of each reply from the HTML."""
        reply_list = []
        tree = etree.HTML(page_source)
        # Every reply lives in a <div> whose id starts with "post_".
        replies = tree.xpath('//div[starts-with(@id, "post_")]')
        print("replies: " + str(replies))
        for reply in replies:
            print("reply: " + str(reply))
            # Author name.
            auth_name = self._first_text(reply, './/div[@class="auth"]')
            # Author rank / info block.
            auth_rank = self._first_text(reply, './/div[@class="info clearfix"]')
            print("auth_rank: " + str(auth_rank))
            # Reply body.
            reply_content = self._first_text(reply, './/td[@class="postbody"]')
            print("reply_content: " + str(reply_content))
            reply_list.append({
                "auth_name": auth_name,
                "auth_rank": auth_rank,
                "reply_content": reply_content,
            })
        return reply_list




# Script entry point: build the scraper and run the login + scrape flow once.
scraper = DingxiangyuanLogin()
scraper.run()

 

datawhale爬虫task04

原文:https://www.cnblogs.com/tommyngx/p/11343195.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!