soup = BeautifulSoup(book, "html.parser")
# Find every div with class "bookname"
div = soup.find_all("div", "bookname")
# Clean out the book titles
for item in div:
    # The title is the text of the first <a> tag
    books.append(item.a.string)
return books
Lookup method: find_all
Iteration method: for item in div:
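As a minimal sketch of this lookup-then-iterate pattern (the HTML fragment below is made up; the real page's markup may differ):

from bs4 import BeautifulSoup

# Made-up fragment standing in for the real listing page
html = ('<div class="bookname"><a href="/book/1">Book One</a></div>'
        '<div class="bookname"><a href="/book/2">Book Two</a></div>')
soup = BeautifulSoup(html, "html.parser")
# find_all("div", "bookname"): the second argument filters on the class attribute
div = soup.find_all("div", "bookname")
for item in div:
    print(item.a.string)  # prints "Book One", then "Book Two"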
# Fetch a page
def gethtml(url):
    info = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=info)
        # Raise an exception for a bad HTTP status
        data.raise_for_status()
        # Switch to the encoding detected from the content
        data.encoding = data.apparent_encoding
        # Return the page source
        return data.text
    except requests.RequestException:
        return ""
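The encoding line is what keeps Chinese pages from coming back as mojibake: requests takes response.encoding from the HTTP headers, while apparent_encoding is guessed from the response bytes themselves. A quick way to see the difference (any live URL works; the one below is just an example):

import requests

r = requests.get('http://book.zongheng.com', headers={'User-Agent': 'Mozilla/5.0'})
print(r.encoding)           # what the response headers declare
print(r.apparent_encoding)  # what the bytes actually look like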
# Clean out the book titles
for item in div:
    # The title is the text of the first <a> tag
    books.append(item.a.string)
return books

# Clean out the author names
for item in div:
    authors.append(item.a.string)
return authors

# Clean out the book types
for item in div:
    # The type is the second <a> tag; use a separate name so the
    # list isn't overwritten (the original reused "type" for both)
    t = item.find_all('a')[1].text
    types.append(t)
return types
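All three loops assume the same markup: one div per book, the author in the first <a> and the type in the second. A self-contained check of that assumption on a made-up fragment:

from bs4 import BeautifulSoup

# Made-up fragment: author in the first <a>, type in the second,
# as the cleaning code above assumes
html = '<div class="bookilnk"><a>Some Author</a><a>Fantasy</a></div>'
item = BeautifulSoup(html, "html.parser").find("div", "bookilnk")
print(item.a.string)               # Some Author
print(item.find_all('a')[1].text)  # Fantasy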
# Read a file back
def read_file(url):
    try:
        data = pd.read_excel(url, names=['book', 'author', 'type'])
        print('File read successfully')
        return data
    except Exception:
        print('File does not exist or the file name is wrong')
def file(books, authors, types, addr):
    # Create a Workbook
    excel = xlwt.Workbook(encoding='utf-8')
    # Create sheet A
    sheet1 = excel.add_sheet(u'A', cell_overwrite_ok=True)
    # Write the column headers
    sheet1.write(0, 0, 'book')
    sheet1.write(0, 1, 'author')
    sheet1.write(0, 2, 'type')
    # Column 1: data rows start at row 1, below the header
    # (starting the loop index at 1 would silently drop the first record)
    for i in range(len(books)):
        sheet1.write(i + 1, 0, books[i])
    # Column 2
    for j in range(len(authors)):
        sheet1.write(j + 1, 1, authors[j])
    # Column 3
    for z in range(len(types)):
        sheet1.write(z + 1, 2, types[z])
    # Save to the given path
    excel.save(addr)
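The writer and reader can be exercised together without touching the network; a round-trip sketch (the path and row values are just examples, and pandas needs the xlrd package to read .xls files):

# Write two made-up rows, then read them straight back
file(['Book One', 'Book Two'], ['Author A', 'Author B'],
     ['Fantasy', 'Sci-fi'], 'sample.xls')
print(read_file('sample.xls'))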
The complete program:

from bs4 import BeautifulSoup
import requests, xlwt
import pandas as pd

# Fetch a page
def gethtml(url):
    info = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=info)
        # Raise an exception for a bad HTTP status
        data.raise_for_status()
        # Switch to the encoding detected from the content
        data.encoding = data.apparent_encoding
        # Return the page source
        return data.text
    except requests.RequestException:
        return ""

# Book titles
def get_book(url):
    books = []
    # Fetch the page
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    # Find every div with class "bookname"
    div = soup.find_all("div", "bookname")
    # Clean out the book titles
    for item in div:
        # The title is the text of the first <a> tag
        books.append(item.a.string)
    return books

# Author names
def get_author(url):
    authors = []
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    # Find every div with class "bookilnk"
    div = soup.find_all("div", "bookilnk")
    # Clean out the author names
    for item in div:
        authors.append(item.a.string)
    return authors

# Book types
def get_type(url):
    types = []
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    # Find every div with class "bookilnk"
    div = soup.find_all("div", "bookilnk")
    # Clean out the book types: the type is the second <a> tag
    for item in div:
        t = item.find_all('a')[1].text
        types.append(t)
    return types

def file(books, authors, types, addr):
    # Create a Workbook
    excel = xlwt.Workbook(encoding='utf-8')
    # Create sheet A
    sheet1 = excel.add_sheet(u'A', cell_overwrite_ok=True)
    # Write the column headers
    sheet1.write(0, 0, 'book')
    sheet1.write(0, 1, 'author')
    sheet1.write(0, 2, 'type')
    # Column 1: data rows start at row 1, below the header
    for i in range(len(books)):
        sheet1.write(i + 1, 0, books[i])
    # Column 2
    for j in range(len(authors)):
        sheet1.write(j + 1, 1, authors[j])
    # Column 3
    for z in range(len(types)):
        sheet1.write(z + 1, 2, types[z])
    # Save to the given path
    excel.save(addr)

# Read a file back
def read_file(url):
    try:
        data = pd.read_excel(url, names=['book', 'author', 'type'])
        print('File read successfully')
        return data
    except Exception:
        print('File does not exist or the file name is wrong')

# Main entry point
def main():
    for i in range(1, 4):
        url = 'http://book.zongheng.com/store/c0/c0/b0/u0/p{}/v9/s1/t0/u0/i1/ALL.html'.format(i)
        # Book titles
        books = get_book(url)
        # Author names
        authors = get_author(url)
        # Book types
        types = get_type(url)
        addr = 'D:{}.xls'.format(i)
        # Write one file per page
        file(books, authors, types, addr)
    # Read the data files back
    for i in range(1, 4):
        addr = 'D:{}.xls'.format(i)
        data = read_file(addr)
        print(data)

if __name__ == '__main__':
    main()
Run result:
Source: https://www.cnblogs.com/szh123/p/12043965.html