
Python crawler: search for a novel and download it

Posted: 2019-10-12 14:56:32
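The script below drives a headless Chrome instance to search so.biqusoso.com for a novel by title, follows the first result to its chapter list on biqugex.com, and then downloads every chapter into a folder named after the novel on the desktop, one .txt file per chapter.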
# coding:utf-8
import requests, os, re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys


class downloader():

    def __init__(self):
        self.urls = []  # chapter links
        self.name = []  # chapter titles
        self.url = 'https://so.biqusoso.com/s.php?ie=utf-8&siteid=biqugex.com&q='

    def Get_url(self):
        """Ask for the novel title, search for it, and return the link of the first result."""
        # Configure Chrome to run headless (no browser window)
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        # Start the headless Chrome instance
        browser = webdriver.Chrome(options=chrome_options)
        browser.get(self.url)
        c = input('Please enter the full title of the novel: ')
        browser.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div[2]/form/input[3]').send_keys(c)
        browser.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div[2]/form/input[4]').click()
        new_url = browser.current_url
        # Close the window and shut down the chromedriver process
        browser.close()
        browser.quit()
        print('Browser closed')
        # print(new_url)
        response = requests.get(new_url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')
        # print(soup)
        name1 = soup.find_all('span', class_='s2')
        soup = BeautifulSoup(str(name1), 'lxml')
        new_name = soup.find('a')
        new_name1 = new_name.string  # title of the first search result
        # print(new_name1)
        self.href = new_name.attrs['href']  # link to the novel's chapter-list page
        print(self.href)
        return self.href
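Note that find_element_by_xpath only exists in older Selenium releases; Selenium 4 removed those helpers in favor of By locators. A minimal sketch of the equivalent calls inside Get_url, assuming Selenium 4 and the same XPath expressions:

from selenium.webdriver.common.by import By

# locate the search box, type the title, then click the submit button
browser.find_element(By.XPATH, '//*[@id="wrapper"]/div[1]/div[2]/form/input[3]').send_keys(c)
browser.find_element(By.XPATH, '//*[@id="wrapper"]/div[1]/div[2]/form/input[4]').click()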
    def Response(self):
        """Fetch the chapter-list page and collect every chapter title and link."""
        response = requests.get(self.href)
        response.encoding = 'gbk'  # the page is gbk-encoded; set it to avoid garbled text
        self.soup = BeautifulSoup(response.text, 'lxml')  # parse the page
        div = self.soup.find_all('div', class_='listmain')  # the chapter list lives in class='listmain'
        soup1 = BeautifulSoup(str(div), 'lxml')  # re-parse just that block
        h = soup1.find_all('a')  # every <a> under 'listmain' is one chapter
        for i in h:
            self.name.append(i.string)  # the tag text is the chapter title
            self.urls.append('https://www.biqugex.com%s' % i.get('href'))  # hrefs are relative, prepend the site root
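The hard-coded 'https://www.biqugex.com%s' prefix only works while the chapter hrefs are relative paths. A slightly more robust sketch builds the absolute links with the standard library's urljoin, resolving each href against the chapter-list URL (self.href); the rest of the loop stays the same:

from urllib.parse import urljoin

for i in h:
    self.name.append(i.string)
    self.urls.append(urljoin(self.href, i.get('href')))  # handles both relative and absolute hrefs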
    def file(self):
        """Find the novel title, create a folder of the same name, and save every chapter into it."""
        div1 = self.soup.select('body > div.book > div.info > h2')
        a = BeautifulSoup(str(div1), 'lxml')
        b = a.find('h2')
        b = b.string  # the novel title
        c = 'C:\\Users\\Administrator\\Desktop\\%s' % b
        if not os.path.exists(c):
            os.mkdir(c)

        # Walk through the chapter links and extract the body text of each chapter
        i = 0
        while i < len(self.urls):
            response1 = requests.get(url=self.urls[i])
            response1.encoding = 'gbk'
            soup2 = BeautifulSoup(response1.text, 'lxml')
            d = soup2.find_all('div', id='content')
            id1 = BeautifulSoup(str(d), 'lxml')
            # Build the file name from the chapter title
            src = self.name[i] + '.txt'
            filename = c + '/' + src
            print(filename)

            # Write the extracted chapter text to the file
            for result in id1:
                res = result.text
                with open(filename, 'w+', encoding='utf-8') as f:
                    f.write(res)
                i += 1
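Chapter titles scraped from the site can contain characters that are not legal in Windows file names (?, *, : and so on), which would make the open() call fail. A hypothetical clean_name helper, built on the re module the script already imports, is one way to guard against that:

def clean_name(title):
    """Strip characters that Windows does not allow in file names."""
    return re.sub(r'[\\/:*?"<>|]', '_', title.strip())

# e.g. inside the loop: src = clean_name(self.name[i]) + '.txt'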
    def Main(self):
        """Run the search; if nothing is found, say so, otherwise fetch and save the novel."""
        try:
            self.Get_url()
        except Exception:
            print('Novel not found')
        else:
            self.Response()
            self.file()

if __name__ == '__main__':
    # url = input('Please enter the URL: ')
    # url = 'https://www.biqugex.com/book_104027/'
    a = downloader()
    a.Main()
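To run this sketch you need requests, beautifulsoup4, lxml and selenium installed, plus a chromedriver binary that matches the local Chrome version. The output folder (C:\Users\Administrator\Desktop) is hard-coded in file() and has to be adjusted on other machines.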

 


Original post: https://www.cnblogs.com/hfct/p/11661063.html
