express官网:http://www.expressjs.com.cn/
npm install express --save
npm install express-generator -g
express myapp
cd myapp
npm install
superagent官网:http://visionmedia.github.io/superagent/
npm install superagent
cheerio官网:https://cheerio.js.org/
npm install cheerio
var express = require("express");
const cheerio = require('cheerio');
const superagent = require('superagent');
const router = express.Router();

/**
 * GET / — downloads the donews.com front page, collects every link matched
 * by the headline selector and responds with them as an array of
 * `{ title, href }` objects.
 */
router.get('/', (req, res, next) => {
  // Fetch the page content.
  superagent.get('http://www.donews.com/').end((err, sres) => {
    if (err) {
      // Hand download failures to the application's error handler.
      next(err);
      return;
    }

    // Parse the downloaded HTML so it can be queried with jQuery-style selectors.
    const $ = cheerio.load(sres.text);

    // Turn each matched anchor into a plain object; `.get()` unwraps the
    // cheerio collection into a regular array.
    const items = $('div.block h3.block a')
      .map((idx, element) => {
        const $link = $(element);
        return {
          title: $link.text(),
          href: $link.attr('href')
        };
      })
      .get();

    res.send(items);
  });
});

module.exports = router;
superagent.get('抓取网页的地址')
网页的 HTML 内容存储在 sres.text 里面。
用 cheerio.load 加载得到的 HTML 内容并赋给变量 $。
后面选择所需内容的语法和 jQuery 选择器基本一致,选择需要的元素进行遍历,
然后返回遍历得到的内容。
var createError = require('http-errors');
var express = require('express');
var path = require('path');
var cookieParser = require('cookie-parser');
var logger = require('morgan');
var newsRouter = require('./routes/news');
const app = express(); // Create the Express application instance.

/**
 * Demo middleware: prints a fixed marker for every request that reaches it.
 */
const myLogger = function (req, res, next) {
  console.log('LOGGED');
  next();
};

/**
 * Demo middleware: stores the current timestamp on `req.requestTime`
 * and prints it.
 */
const requestTime = function (req, res, next) {
  req.requestTime = Date.now();
  console.log(req.requestTime);
  next();
};

/**
 * Adds the cross-origin (CORS) response headers and answers preflight
 * requests.
 *
 * No Content-Type is forced here: `res.send`/`res.json`/`res.render` choose
 * the correct type themselves, whereas a blanket `application/json` would
 * mislabel the HTML error page rendered by the error handler.
 */
const allowCrossOrigin = function (req, res, next) {
  res.header('Access-Control-Allow-Origin', '*');
  res.header('Access-Control-Allow-Headers', 'Content-Type, Content-Length, Authorization, Accept, X-Requested-With , yourHeaderFeild');
  res.header('Access-Control-Allow-Methods', 'PUT,POST,GET,DELETE,OPTIONS');
  res.header('X-Powered-By', ' 3.2.1');

  // A preflight must receive a successful status; letting it fall through
  // to the 404 handler makes the browser reject the real request.
  if (req.method === 'OPTIONS') {
    res.sendStatus(204);
    return;
  }
  next();
};

// view engine setup
app.set('views', path.join(__dirname, 'views'));
app.set('view engine', 'pug');

app.use(logger('dev'));
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.use(cookieParser());
// Expose the images, CSS and JavaScript files under `public`
// (resolved to an absolute path).
app.use(express.static(path.join(__dirname, 'public')));
app.use(myLogger);
app.use(requestTime);

// Enable cross-origin requests. This must be registered BEFORE the routes:
// middleware runs in registration order, and a route that sends a response
// ends the chain, so headers added afterwards would never reach the client.
// Mounted without a path so it applies to every request.
app.use(allowCrossOrigin);

app.use('/news', newsRouter);

// catch 404 and forward to error handler
app.use(function (req, res, next) {
  next(createError(404));
});

// error handler
app.use(function (err, req, res, next) {
  // set locals, only providing error in development
  res.locals.message = err.message;
  res.locals.error = req.app.get('env') === 'development' ? err : {};

  // render the error page
  res.status(err.status || 500);
  res.render('error');
});

module.exports = app;
引入路由的代码:
var newsRouter = require('./routes/news');
app.use('/news', newsRouter);
npm start
浏览器打开项目即可看到爬取的数据
原文:https://www.cnblogs.com/lpkshuai/p/11811648.html