最近迷上了《三体》小说。网上的小说基本上都是分章节、一篇一篇发布的,人肉 Ctrl+C / Ctrl+V 实在是太 low 了。干脆自己写个脚本吧,一劳永逸。
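基本思路:
1. 用 requests 请求小说目录页,通过 XPath 解析出每一章的标题和链接;
2. 用 aiohttp 协程并发抓取各章节页面,把(标题, HTML)放入队列;
3. 开多个线程从队列中取出页面,解析正文并按卷名分目录写入 txt 文件。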
实现如下:
"""
===================================
-*- coding:utf-8 -*-
Author :GadyPu
E_mail :Gadypy@gmail.com
Time :2020/10/7 0007 上午 11:59
FileName :spider.py
===================================
"""
import os
import re
import sys
import requests
from lxml import etree
import asyncio
import aiohttp
from queue import Queue
import threading
class GetNovels(object):
    """Download every chapter of a novel from its index page into text files.

    Pipeline:
      1. ``get_chapter_urls`` scrapes (title, link) pairs from the index page.
      2. ``fetch`` coroutines download the chapter pages concurrently.
      3. ``parse_chapter`` worker threads extract the text and write one
         ``.txt`` file per chapter.
    """

    # Characters replaced with '-' when building file and directory names.
    # This is the set that is not allowed in Windows file names; the
    # backslash is included (the earlier pattern only covered '/').
    _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, url, name):
        """Remember the index page URL and the output folder name.

        Args:
            url: URL of the novel's table-of-contents page.
            name: Novel name; used as the output directory name by ``run``.
        """
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/84.0.4147.135 Safari/537.36'
            )
        }
        self.novel_url = url
        # (title, link) pairs waiting to be downloaded; drained by fetch().
        self.htmlQ = asyncio.Queue()
        # (title, html) pairs waiting to be written; drained by the
        # parse_chapter() threads. A None item tells a worker to stop.
        self.chapterQ = Queue()
        self.novel_name = name

    @staticmethod
    def _safe_name(text):
        """Return *text* with file-name-unsafe characters replaced by '-'."""
        return GetNovels._UNSAFE_CHARS.sub('-', text)

    def get_chapter_urls(self):
        """Scrape the index page and queue every (title, link) pair on htmlQ.

        Prints the error and exits the process with status 1 when the page
        cannot be downloaded or parsed.
        """
        try:
            response = requests.get(
                url=self.novel_url, headers=self.headers, timeout=30
            )
            # Fail loudly on 4xx/5xx instead of parsing an error page.
            response.raise_for_status()
            html = etree.HTML(response.content.decode('utf-8'))
            # Read title and href from the same <a> element so the two can
            # never get out of step (zipping two separate lists could).
            anchors = html.xpath(
                r'//div[@class="book-list clearfix"]/ul/li/a'
            )
            for anchor in anchors:
                title = ''.join(anchor.xpath('./text()'))
                link = anchor.get('href')
                if not title or not link:
                    continue
                self.htmlQ.put_nowait((title, link))
        except Exception as e:
            print(e, '\n', 'network error cannot parse chapter url')
            sys.exit(1)

    async def fetch(self):
        """Download queued chapter pages until htmlQ is empty.

        Each (title, link) taken from htmlQ is requested; on HTTP 200 the
        pair (title, decoded html) is put on chapterQ for the parser threads.
        Any other status is reported and the chapter is skipped. Prints the
        error and exits the process with status 1 on an exception.
        """
        try:
            # NOTE(review): ssl=False turns off TLS certificate verification;
            # confirm this is still required for the target site.
            connector = aiohttp.TCPConnector(ssl=False)
            async with aiohttp.ClientSession(connector=connector) as session:
                while True:
                    # get_nowait() never suspends, so the queue is never
                    # awaited and works under whichever loop runs us.
                    try:
                        title, link = self.htmlQ.get_nowait()
                    except asyncio.QueueEmpty:
                        break
                    # assumes link is an absolute URL — TODO confirm
                    async with session.get(
                        url=link, headers=self.headers
                    ) as response:
                        if response.status == 200:
                            html = await response.read()
                            self.chapterQ.put((title, html.decode('utf-8')))
                        else:
                            print(f'skip {title}: HTTP {response.status}')
                    # Small pause between requests to avoid hammering the site.
                    await asyncio.sleep(0.3)
        except Exception as e:
            print(e, '\n', 'network error cannot fetch chapters...')
            sys.exit(1)

    async def _fetch_all(self, count):
        """Run *count* fetch() coroutines concurrently until htmlQ is drained."""
        await asyncio.gather(*(self.fetch() for _ in range(count)))

    def _save_chapter(self, path, worker, title, page):
        """Extract one chapter's text from *page* and write it to a .txt file."""
        html = etree.HTML(page)
        # All text nodes below the <p> tags of the content container.
        paragraphs = html.xpath(r'//*[@id="nr1"]/p//text()')
        # The fifth breadcrumb entry is used as the sub-directory name; fall
        # back to the novel root when the page has no such entry.
        volume = html.xpath(r'//*[@id="bcrumb"]/span[5]/a/text()')
        if volume:
            chapter_dir = os.path.join(path, self._safe_name(volume[0]))
        else:
            chapter_dir = path
        # exist_ok avoids the check-then-create race between worker threads.
        os.makedirs(chapter_dir, exist_ok=True)
        chapter_name = os.path.join(chapter_dir, self._safe_name(title))
        print(f'thread:{worker} is parsing: ' + title)
        with open(chapter_name + '.txt', 'w', encoding='utf-8') as wf:
            wf.write(title + '\n\n')
            wf.writelines(str(line) + '\n' for line in paragraphs)

    def parse_chapter(self, path, id):
        """Worker loop: turn downloaded chapter pages into text files.

        Blocks on chapterQ and handles items until a None sentinel arrives.
        A failure on one chapter is reported and the worker carries on, so a
        single bad page cannot stop the remaining chapters from being saved.

        Args:
            path: Root output directory of the novel.
            id: Worker number, used only in progress messages (the name
                shadows the builtin but is kept for interface compatibility).
        """
        while True:
            data = self.chapterQ.get()
            if data is None:
                break
            title, page = data
            try:
                self._save_chapter(path, id, title, page)
            except Exception as e:
                print(e, '\n', f'thread:{id} cannot save chapter: ' + title)

    def run(self, fetchers=20, parsers=3):
        """Scrape the index, download all chapters and write them to disk.

        Output goes to a directory named after the novel under the current
        working directory.

        Args:
            fetchers: Number of concurrent fetch() coroutines; bounded so
                the site is not crawled too fast.
            parsers: Number of parse_chapter() worker threads.

        Raises:
            ValueError: If *fetchers* or *parsers* is smaller than 1.
        """
        if fetchers < 1 or parsers < 1:
            raise ValueError('fetchers and parsers must be >= 1')

        self.get_chapter_urls()
        path = os.path.join(os.getcwd(), self.novel_name)
        os.makedirs(path, exist_ok=True)

        workers = []
        for number in range(1, parsers + 1):
            worker = threading.Thread(
                target=self.parse_chapter, args=(path, number), daemon=True
            )
            worker.start()
            workers.append(worker)

        try:
            # asyncio.wait() no longer accepts bare coroutines (3.11+), so
            # the fetchers are gathered inside a coroutine run by asyncio.run.
            asyncio.run(self._fetch_all(fetchers))
        finally:
            # One sentinel per worker; pages already downloaded are still
            # written before the threads stop.
            for _ in workers:
                self.chapterQ.put(None)
            for worker in workers:
                worker.join()
def main():
    """Download the novel at the hard-coded index URL.

    Builds a GetNovels for the index page below and calls its run() method.
    """
    url = 'https://www.luoxia.com/santi/'
    name = '三体'
    GetNovels(url, name).run()


if __name__ == '__main__':
    main()
原文:https://www.cnblogs.com/GadyPu/p/13777270.html