首页 > 其他 > 详细

爬取四大名著

时间:2019-07-06 17:39:35      阅读:183      评论:0      收藏:0      [点我收藏+]
'''
 诗词名句网
 1. 爬取固定书籍
 2. 爬取书名
 3. 爬取本部书的章回目录
 4. 灵活处理,爬取任意书籍的章回目录
 5. 加入异常处理
 6. 爬取任意整本书
'''

import requests
import re

def bookSpider(oldurl,bookName):
    """Download one book: index page, title, table of contents, then chapters.

    The index page ``oldurl + ".html"`` is fetched with ``loadPage`` and saved
    to ``demo.txt``; ``findTitle`` and ``findTileOfPages`` then read that file,
    and ``getWholeBook`` is called with the chapter count that
    ``findTileOfPages`` returns.

    Args:
        oldurl: Book URL without the ``.html`` suffix; chapter URLs are built
            from it by ``getWholeBook``.
        bookName: Book identifier passed through to the helper functions.

    Returns:
        None. Nothing further is attempted when the index page cannot be
        loaded or saved.
    """
    html = loadPage(oldurl + ".html")
    if not html:
        # loadPage yields nothing when the request fails; carrying on would
        # parse a stale demo.txt from an earlier run (or crash on a missing one).
        return

    try:
        with open("demo.txt", 'w', encoding='utf-8') as f:
            f.write(html)
    except OSError:
        print("FILE OPERATION ERROR")
        return

    findTitle("demo.txt", bookName)
    cnt = findTileOfPages("demo.txt", bookName)
    getWholeBook(oldurl, bookName, cnt)

def findTitle(filename,bookName):
    """Extract the book title from a saved index page into ``book.txt``.

    Every line of ``filename`` is searched for ``<title>《...》`` with at most
    ten characters between the brackets. Each title found is printed after the
    label ``书名:`` and written, brackets included, on its own line of
    ``book.txt``. ``book.txt`` is opened in write mode, so earlier content is
    discarded.

    Args:
        filename: Path of the UTF-8 encoded HTML file to scan.
        bookName: Not used for the search; kept so existing calls keep working.

    Returns:
        None. If either file cannot be opened, ``FILE OPERATION ERROR`` is
        printed and nothing else happens.
    """
    # The capture group yields the bracketed title directly, so there is no
    # need to walk the characters of the match object's repr.
    title_pattern = re.compile(r'<title>(《.{0,10}》)')
    try:
        with open(filename, encoding='utf-8') as page:
            with open("book.txt", 'w', encoding='utf-8') as book:
                for line in page:
                    found = title_pattern.search(line)
                    if found:
                        title = found.group(1)
                        print("书名:" + title)
                        book.write(title + '\n')
    except OSError:
        print("FILE OPERATION ERROR")

def findTileOfPages(filename,bookName):
    """Append the chapter list of a saved index page to ``book.txt``.

    Each line of ``filename`` is searched for chapter links of the form
    ``<li><a href="/book/<bookName>/<number>.html">TITLE</a></li>`` where
    TITLE is 10 to 40 characters long. ``book.txt`` first receives the heading
    line ``目录:``; every TITLE found is then printed and appended on its own
    line.

    Args:
        filename: Path of the UTF-8 encoded HTML file to scan.
        bookName: Book identifier as it appears in the chapter link paths.

    Returns:
        The number of chapter links found. 0 is returned, after printing
        ``FILE OPERATION ERROR``, when a file cannot be opened.
    """
    # bookName and the dot before "html" are escaped so they match literally;
    # [^<] keeps one match from running across several links on a single line.
    link_pattern = re.compile(
        r'<li><a href="/book/' + re.escape(bookName)
        + r'/\d+\.html">([^<]{10,40})</a></li>'
    )
    cnt = 0
    try:
        with open(filename, encoding='utf-8') as page:
            with open("book.txt", 'a', encoding='utf-8') as book:
                book.write("目录:\n")
                for line in page:
                    for title in link_pattern.findall(line):
                        cnt += 1
                        print(title)
                        book.write(title + '\n')
    except OSError:
        print("FILE OPERATION ERROR")
    return cnt

# Patterns for chapter pages, compiled once instead of once per input line.
_TAG_RE = re.compile(r'<[^>]*>')
_CHAPTER_TITLE_RE = re.compile(r'<h1>(.+)</h1>')
# Non-greedy so that several paragraphs sharing one line stay separate.
_CHAPTER_PARAGRAPH_RE = re.compile(r'<p>&nbsp;&nbsp;&nbsp;&nbsp;(.+?)</p>')


def _chapter_text_lines(html):
    """Return the heading and paragraph texts of one chapter page, in page order."""
    texts = []
    for raw_line in html.splitlines():
        # Within a line, headings are emitted before paragraphs.
        for pattern in (_CHAPTER_TITLE_RE, _CHAPTER_PARAGRAPH_RE):
            for fragment in pattern.findall(raw_line):
                plain = _TAG_RE.sub('', fragment).replace('&nbsp;', ' ')
                texts.append(plain.strip())
    return texts


def getWholeBook(url,bookName,cnt):
    """Download chapters 1..cnt and append their text to ``book.txt``.

    For each chapter number ``i`` the page ``url + '/' + str(i) + ".html"`` is
    fetched with ``loadPage``. The raw page is saved to ``bookHtml.txt``
    (overwritten per chapter). The text of every ``<h1>...</h1>`` heading and
    of every ``<p>`` paragraph that starts with four ``&nbsp;`` entities is
    appended to ``book.txt``, one item per line, with tags and ``&nbsp;``
    entities removed.

    Args:
        url: Book URL without a trailing slash or ``.html`` suffix.
        bookName: Not used here; kept so existing calls keep working.
        cnt: Number of chapters to download; nothing is fetched when it is
            less than 1.

    Returns:
        None.
    """
    print("正在下载全本书,请稍后...")
    for chapter in range(1, cnt + 1):
        newUrl = url + '/' + str(chapter) + ".html"
        print(newUrl)
        html = loadPage(newUrl)
        if not html:
            # Without this guard the previous chapter's bookHtml.txt would be
            # parsed again and its text duplicated in book.txt.
            print("SKIPPED: " + newUrl)
            continue

        try:
            with open("bookHtml.txt", 'w', encoding='utf-8') as raw_page:
                raw_page.write(html)
        except OSError:
            # The raw copy is not needed for extraction, so only report it.
            print("FILE OPERATION ERROR")

        try:
            with open("book.txt", 'a', encoding='utf-8') as bookContent:
                bookContent.writelines(
                    text + '\n' for text in _chapter_text_lines(html)
                )
        except OSError:
            print("FILE OPERATION ERROR")

def loadPage(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or None after printing ``PAGE LOAD ERROR`` when the
        request fails, the server answers with an HTTP error status, or the
        body is not valid UTF-8.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    try:
        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, headers=header, timeout=10)
        # Treat 4xx/5xx as failures so error pages are not parsed as chapters.
        response.raise_for_status()
        return response.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError):
        print("PAGE LOAD ERROR")
        return None

if __name__ == "__main__":
    # Script entry point: read the book's identifier (the prompt asks for full
    # pinyin) and crawl the matching book under /book/ on shicimingju.com.
    bookName = input("请输入想看的书名:(全拼)")
    bookSpider("http://www.shicimingju.com/book/" + bookName, bookName)

 

爬取四大名著

原文:https://www.cnblogs.com/TheSilverMoon/p/11143203.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!