Python高级应用程序设计任务

时间：2019-12-15 15:18:21 阅读：98 评论：0 收藏：0 [点我收藏+]

用Python实现一个面向主题的网络爬虫程序，并完成以下内容：
（注：每人一题，主题内容自选，所有设计内容与源代码需提交到博客园平台）

一、主题式网络爬虫设计方案（15分）

1.主题式网络爬虫名称

纵横中文网
2.主题式网络爬虫爬取的内容与数据特征分析

爬取小说网完本的页数

关于该网页的小说名、作者、分类情况
3.主题式网络爬虫设计方案概述（包括实现思路与技术难点）

思路：本次设计方案主要依靠request库对目标页面进行信息的爬取采集，再用BeautifulSoup对数据进行清洗，

用find_all方法取标签，再从标签中取数据值。

技术难点：对于小说爬取内容的数据清理和窗口广告。

二、主题页面的结构特征分析（15分）
1.主题页面的结构特征

http://book.zongheng.com/store/c0/c0/b0/u0/p1/v9/s1/t0/u0/i1/ALL.html

了解纵横中文网完本url，修改不同字节获取数据

2.Htmls页面解析

div class=“bookname”爬取需要的信息 tagrget=“blank”所要的分类小说

3.节点（标签）查找方法与遍历方法

 soup = BeautifulSoup(book, "html.parser")
    #获取div
    div = soup.find_all("div", "bookname")
    #对书籍名称进行清洗
    for item in div:
        #获取书籍地址
        books.append(item.a.string)
    return books

查找方法：find_all

遍历方法：for item in div:

（必要时画出节点树结构）

三、网络爬虫程序设计（60分）
爬虫程序主体要包括以下各部分，要附源代码及较详细注释，并在每部分程序后面提供输出结果的截图。

1.数据爬取与采集

#获取页面
def gethtml(url):
    info = {‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36‘}
    try:
        data = requests.get(url, headers=info)
        #获取页面状态
        data.raise_for_status()
        #更改编码
        data.encoding = data.apparent_encoding
        #返回数据
        return data.text
    except:
        return " "

2.对数据进行清洗和处理

    #对书籍名称进行清洗
    for item in div:
        #获取书籍地址
        books.append(item.a.string)
    return books

    #对作者名称进行清洗
    for item in div:
        authors.append(item.a.string)
    return authors

    #对书籍类型进行清洗
    for item in div:
        type = item.find_all(‘a‘)[1].text
        type.append(type)
    return type

3.数据分析与可视化
（例如：数据柱形图、直方图、散点图、盒图、分布图、数据回归分析等）

#读文件
def read_file(url):
    try:
        data = pd.read_excel(url, names=[‘book‘, ‘author‘, ‘type‘])
        print(‘读取文件成功‘)
        return data
    except:
        "文件不存在或文件名错误"

4.数据持久化

def file(books, author, type, addr):
    # 创建Workbook
    excel = xlwt.Workbook(encoding=‘utf-8‘)
    #创建A表
    sheet1 = excel.add_sheet(u‘A‘, cell_overwrite_ok=True)
    #写入各列名
    sheet1.write(0, 0, ‘book‘)
    sheet1.write(0, 1, ‘author‘)
    sheet1.write(0, 2, ‘type‘)

    #第一列
    for i in range(1, len(books)):
        sheet1.write(i, 0, books[i])
    #第二列
    for j in range(1, len(author)):
        sheet1.write(j, 1, author[j])
    #第三列
    for z in range(1, len(type)):
        sheet1.write(z, 2, type[z])

    #保存地址
    excel.save(addr)

全部程序：

from bs4 import BeautifulSoup
import requests, xlwt
import pandas as pd


#获取页面
def gethtml(url):
    info = {‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36‘}
    try:
        data = requests.get(url, headers=info)
        #获取页面状态
        data.raise_for_status()
        #更改编码
        data.encoding = data.apparent_encoding
        #返回数据
        return data.text
    except:
        return " "


#书籍名称
def get_book(url):
    books = []
    #调用页面函数
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    #获取div
    div = soup.find_all("div", "bookname")
    #对书籍名称进行清洗
    for item in div:
        #获取书籍地址
        books.append(item.a.string)
    return books


#作者名称
def get_author(url):
    authors = []
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    #获取属性为tit的p标签
    div = soup.find_all("div", "bookilnk")
    #对作者名称进行清洗
    for item in div:
        authors.append(item.a.string)
    return authors


#书籍类型
def get_type(url):
    type = []
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    #获取div
    div = soup.find_all("div", "bookilnk")
    #对书籍类型进行清洗
    for item in div:
        type = item.find_all(‘a‘)[1].text
        type.append(type)
    return type


def file(books, author, type, addr):
    # 创建Workbook
    excel = xlwt.Workbook(encoding=‘utf-8‘)
    #创建A表
    sheet1 = excel.add_sheet(u‘A‘, cell_overwrite_ok=True)
    #写入各列名
    sheet1.write(0, 0, ‘book‘)
    sheet1.write(0, 1, ‘author‘)
    sheet1.write(0, 2, ‘type‘)

    #第一列
    for i in range(1, len(books)):
        sheet1.write(i, 0, books[i])
    #第二列
    for j in range(1, len(author)):
        sheet1.write(j, 1, author[j])
    #第三列
    for z in range(1, len(type)):
        sheet1.write(z, 2, type[z])

    #保存地址
    excel.save(addr)


#读文件
def read_file(url):
    try:
        data = pd.read_excel(url, names=[‘book‘, ‘author‘, ‘type‘])
        print(‘读取文件成功‘)
        return data
    except:
        "文件不存在或文件名错误"


#主函数
def main():
    for i in range(1, 4):
        url = ‘http://book.zongheng.com/store/c0/c0/b0/u0/p{}/v9/s1/t0/u0/i1/ALL.html‘.format(i)
        #获取书名
        books = get_book(url)
        #获取作者名
        authors = get_author(url)
        #获取书籍类型
        types = get_type(url)
        addr = ‘D:{}.xls‘.format(i)
        #写入文件
        file(books, authors, types, addr)
    #读取多个数据文件
    for i in range(1, 4):
        addr = ‘D:{}.xls‘.format(i)
        data = read_file(addr)
        print(data)


if __name__ == ‘__main__‘:
    main()