首页 > 其他 > 详细

学习笔记(爬虫):爬取笔趣阁小说

时间:2020-04-04 13:18:42      阅读:77      评论:0      收藏:0      [点我收藏+]
# -*- coding: utf-8 -*-
import requests
from lxml import etree

class BookSpider(object):
    """Download every chapter of the novel on jianlaixiaoshuo.com into one text file.

    The spider fetches the index page, reads the chapter links and chapter
    titles from its ``<dl class="chapterlist">`` element, then fetches each
    chapter page and appends its title and text to ``剑来.txt`` in the
    current working directory.
    """

    # Seconds to wait for the server before giving up on a request; without a
    # timeout a stalled connection would block the crawl forever.
    timeout = 10

    # Markup fragments stripped from the chapter text.  ``text()`` nodes are
    # not expected to contain tags, so this is only a safeguard.
    _MARKUP_RESIDUE = (
        '<p>',
        '</p>',
        '<script type="text/javascript" src="/tb.js"></script>',
        '<br />',
        '&#8212;',
    )

    def __init__(self):
        # Index page that lists all chapters.
        self.url = "http://www.jianlaixiaoshuo.com/"
        # Base against which the chapter hrefs from the index are resolved.
        self.base_url = "http://www.jianlaixiaoshuo.com/"
        # The header name must be "User-Agent"; any other spelling is sent as
        # an unrelated custom header and the default requests UA is used.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}

    def get_html(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text.

        Raises whatever ``requests.get`` raises (including a timeout after
        ``self.timeout`` seconds) and ``UnicodeDecodeError`` if the body is
        not valid UTF-8.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content.decode()

    def get_xpath(self, html, pattern):
        """Parse *html* with lxml and return the result of the XPath *pattern*."""
        tree = etree.HTML(html)
        return tree.xpath(pattern)

    def save_data(self, data):
        """Append *data* to ``剑来.txt`` (UTF-8) in the current directory."""
        with open("剑来.txt", "a", encoding="utf-8") as f:
            f.write(data)

    def down_load(self, url):
        """Download all chapters listed on the index page at *url*.

        For each chapter the title and cleaned text are printed and appended
        to the output file via ``save_data``.
        """
        # Local import keeps this block self-contained within the file.
        from urllib.parse import urljoin

        html = self.get_html(url)
        link_pattern = '//dl[@class="chapterlist"]/dd/a/@href'
        name_pattern = '//dl[@class="chapterlist"]/dd/a/text()'
        text_pattern = '//div[@id="BookText"]/p/text()'

        # Link address of every chapter.
        book_lists = self.get_xpath(html, link_pattern)
        # Title of every chapter.
        book_name_lists = self.get_xpath(html, name_pattern)
        print(book_lists)

        for book_name, chapter_href in zip(book_name_lists, book_lists):
            # urljoin handles relative, root-relative ("/x") and absolute
            # hrefs; plain concatenation produced "//" or broken URLs.
            book_url = urljoin(self.base_url, chapter_href)
            book_html = self.get_html(book_url)

            # Join the paragraph text nodes into one string, then clean it.
            book_data = ''.join(self.get_xpath(book_html, text_pattern))
            for fragment in self._MARKUP_RESIDUE:
                book_data = book_data.replace(fragment, '')

            book_text = book_name + "\n" + book_data + "\n"
            print("正在下载", book_name)
            print(book_text)
            self.save_data(book_text)

    def run(self):
        """Run the spider against the configured index page."""
        self.down_load(self.url)

if __name__ == "__main__":
    # Script entry point: build the spider and start the full download.
    BookSpider().run()

 

学习笔记(爬虫):爬取笔趣阁小说

原文:https://www.cnblogs.com/maxxu11/p/12631126.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!