const fs = require('fs');
const path = require('path');
const { pipeline } = require('stream');
const { promisify } = require('util');

const axios = require('axios');
const cheerio = require('cheerio');
// URL of the first article listing page; its pagination bar is read to find the page count.
const httpUrL = 'https://www.doutula.com/article/list/?page=1';
/**
 * Synchronously blocks the calling thread for the given number of seconds.
 *
 * The wait is done with `Atomics.wait` on a private buffer that nothing ever
 * notifies, so the thread is parked until the timeout elapses instead of
 * spinning on the clock. The event loop is still blocked for the whole
 * duration: timers, I/O callbacks and promise continuations do not run.
 *
 * @param {number} number Duration in seconds. Values that are not finite
 *   positive numbers (NaN, Infinity, zero, negatives) return immediately
 *   rather than blocking forever.
 * @returns {void}
 */
function sleep(number) {
  const durationMs = Number(number) * 1000;

  // A NaN deadline can never be passed, so reject unusable durations up front.
  if (!Number.isFinite(durationMs) || durationMs <= 0) {
    return;
  }

  const cell = new Int32Array(new SharedArrayBuffer(4));
  // The cell holds 0 and is never changed, so this always runs to the timeout.
  Atomics.wait(cell, 0, 0, durationMs);
}
/**
 * Fetches the first listing page and reads the total page count from its
 * pagination bar.
 *
 * @returns {Promise<string>} Trimmed text of the link inside the
 *   second-to-last `.pagination li` element; an empty string when that
 *   element or its link is missing.
 */
async function geteNum() {
  const res = await axios.get(httpUrL);
  const $ = cheerio.load(res.data);
  const paginationItems = $('.pagination li');

  // Presumably the last item is the "next" button, which makes the one before
  // it the highest page number — verify against the live markup.
  return paginationItems
    .eq(paginationItems.length - 2)
    .find('a')
    .text()
    .trim();
}
/**
 * Crawls the article listing pages one after another, pausing between pages.
 *
 * The number of pages crawled is the total reported by the site, capped at
 * `maxPages`. When the reported total cannot be parsed as a number the cap
 * itself is used. A page that fails is logged and skipped so the remaining
 * pages are still attempted.
 *
 * @param {number} [maxPages=3] Upper bound on how many listing pages to crawl.
 * @param {number} [delaySeconds=3] Pause between two consecutive pages, in
 *   seconds. The pause is timer based, so requests already in flight keep
 *   making progress while waiting.
 * @returns {Promise<void>} Settles once every selected page has been attempted.
 */
async function spider(maxPages = 3, delaySeconds = 3) {
  const reportedTotal = Number.parseInt(await geteNum(), 10);
  const lastPage = Number.isNaN(reportedTotal)
    ? maxPages
    : Math.min(reportedTotal, maxPages);

  // Listing pages are presumably numbered from 1, matching the `?page=1` start URL.
  for (let page = 1; page <= lastPage; page += 1) {
    try {
      await getListPage(page);
    } catch (err) {
      console.error(`Failed to crawl listing page ${page}: ${err.message}`);
    }

    // No need to wait after the final page.
    if (page < lastPage && delaySeconds > 0) {
      await new Promise((resolve) => setTimeout(resolve, delaySeconds * 1000));
    }
  }
}
/**
 * Derives a folder name from a scraped article title.
 *
 * Keeps the text in front of the first digit when there is any; otherwise the
 * whole title is used. Characters that are path separators or are rejected by
 * common filesystems are replaced with `_`, so scraped text cannot escape the
 * download directory.
 *
 * @param {string} rawTitle Title text as read from the page.
 * @returns {string} Non-empty folder name.
 */
function toFolderName(rawTitle) {
  const text = rawTitle.trim();
  const match = /^(.*?)\d/s.exec(text);
  const prefix = match ? match[1].trim() : '';
  // A title with no digit, or one that starts with a digit, has no usable prefix.
  const base = prefix === '' ? text : prefix;
  const safe = base.replace(/[\\/:*?"<>|]/g, '_');
  return safe === '' || safe === '.' || safe === '..' ? 'untitled' : safe;
}

/**
 * Fetches one listing page and saves the images of every article linked on it.
 *
 * For each article link a folder `./img/<title>` is created (parents
 * included) and the article page is handed to `parsePage`. Articles are
 * processed concurrently; a failing article is logged and does not affect the
 * others.
 *
 * @param {number|string} pageNum Listing page number placed in the `page` query parameter.
 * @returns {Promise<void>} Settles once every article on the page has been attempted.
 */
async function getListPage(pageNum) {
  const listUrl = `https://www.doutula.com/article/list/?page=${pageNum}`;
  const res = await axios.get(listUrl);
  // Parse the HTML document with cheerio.
  const $ = cheerio.load(res.data);

  const articles = [];
  $('#home > div > div.col-sm-9>a').each((i, element) => {
    const pageUrl = $(element).attr('href');
    if (!pageUrl) {
      return;
    }
    const title = toFolderName($(element).find('.random_title').text());
    articles.push({ pageUrl, title });
  });

  await Promise.all(
    articles.map(async ({ pageUrl, title }) => {
      try {
        // The folder must exist before any image is written into it.
        await fs.promises.mkdir(`./img/${title}`, { recursive: true });
        await parsePage(pageUrl, title);
      } catch (err) {
        console.error(`Failed to save article ${pageUrl}: ${err.message}`);
      }
    }),
  );
}
/**
 * Downloads one image to disk.
 *
 * The destination file is only opened once the response has arrived, so a
 * failed request does not leave an empty file behind. `pipeline` tears down
 * both streams if either side errors.
 *
 * @param {string} imgUrl Absolute image URL.
 * @param {string} destPath File path to write the image to.
 * @returns {Promise<void>} Resolves when the file has been fully written.
 */
async function downloadImage(imgUrl, destPath) {
  const res = await axios.get(imgUrl, { responseType: 'stream' });
  await promisify(pipeline)(res.data, fs.createWriteStream(destPath));
}

/**
 * Fetches an article page and saves every `.pic-content img` image into
 * `./img/<title>/`.
 *
 * Image sources are resolved against the article URL, so relative and
 * protocol-relative `src` values work, and the file name is taken from the
 * URL path only (no query string). Images without a `src` are skipped.
 * Downloads run concurrently; a failing image is logged and does not affect
 * the others.
 *
 * @param {string} url Absolute URL of the article page.
 * @param {string} title Folder name under `./img` that receives the images.
 * @returns {Promise<void>} Settles once every image has been attempted.
 *   Rejects if the article page cannot be fetched or the folder cannot be created.
 */
async function parsePage(url, title) {
  const res = await axios.get(url);
  const $ = cheerio.load(res.data);

  const sources = [];
  $('.pic-content img').each((i, element) => {
    const src = $(element).attr('src');
    if (src) {
      sources.push(src);
    }
  });
  if (sources.length === 0) {
    return;
  }

  const targetDir = `./img/${title}`;
  // Guarantee the folder exists regardless of what the caller already did.
  await fs.promises.mkdir(targetDir, { recursive: true });

  await Promise.all(
    sources.map(async (src) => {
      try {
        const imgUrl = new URL(src, url);
        // URL paths always use forward slashes, whatever the host OS is.
        const fileName = path.posix.basename(imgUrl.pathname);
        if (fileName === '') {
          return;
        }
        await downloadImage(imgUrl.href, `${targetDir}/${fileName}`);
      } catch (err) {
        console.error(`Failed to download image ${src}: ${err.message}`);
      }
    }),
  );
}
// Start the crawl; report a fatal failure and exit non-zero instead of
// leaving the rejection unhandled.
spider().catch((err) => {
  console.error('Crawl failed:', err);
  process.exitCode = 1;
});