Basic idea:
1. Initialize the link table with one or more entry links in the "initial" status.
2. Maintain a blacklist and a whitelist for the crawler; a link passes the filter only if it matches the whitelist and does not match the blacklist.
3. Take a link from the link table, set it to the "downloading" status, and download its page.
4. Insert the downloaded page into the content table.
5. Parse the links out of the downloaded page, filter out unwanted ones with the rules from step 2, and insert the remaining links into the link table in the "initial" status.
6. Set the link to the "downloaded" status.
Steps 3 through 6 then repeat in a loop. If the download in step 3 fails, the link is left in the "downloading" status and skipped, and the loop continues (a minimal sketch of this status cycle follows).
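The Spider class further down implements this loop with MongoDB's findAndModify, storing the status as an integer on each url document (0 = initial, 1 = downloading, 2 = downloaded). Below is a minimal sketch of the claim-and-commit cycle for a single link, using the same host, collection, and field names as the real code; the StatusCycleSketch class name is made up for illustration.
package com.data;

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;

// Claim-and-commit cycle for a single link (statuses: 0 = initial, 1 = downloading, 2 = downloaded).
public class StatusCycleSketch {
    public static void main(String[] args) throws Exception {
        MongoClient server = new MongoClient("192.168.16.215"); // same MongoDB host as the code below
        DBCollection urls = server.getDB("db").getCollection("url");

        // Step 3: atomically take one "initial" link and mark it "downloading".
        DBObject claimed = urls.findAndModify(
                new BasicDBObject("status", 0),
                new BasicDBObject("$set", new BasicDBObject("status", 1)));

        if (claimed != null) {
            String url = claimed.get("url").toString();
            // ... steps 4 and 5: download the page, store it, extract and queue new links ...

            // Step 6: mark the link as "downloaded".
            urls.update(new BasicDBObject("url", url),
                    new BasicDBObject("$set", new BasicDBObject("status", 2)));
        }
        // If the download fails, the document simply stays at status 1 and the loop skips it.
        server.close();
    }
}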
Code implementation
Blacklist and whitelist
package com.data;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
/**
 *
 * @author yun
 * @date 2015-03-25
 * @time 11:01:57 AM
 * @todo Blacklist and whitelist
 *
 */
public class Regex {
    private List<String> blackList = new ArrayList<String>();
    private List<String> whiteList = new ArrayList<String>();

    public Regex(String blackPath, String whitePath) {
        // Load one regex pattern per line from each file, skipping blank lines.
        try (FileInputStream fis = new FileInputStream(blackPath);
                InputStreamReader isr = new InputStreamReader(fis);
                BufferedReader br = new BufferedReader(isr)) {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0) {
                    continue;
                }
                blackList.add(line);
            }
        } catch (Exception e) {
            System.out.println("Exception while reading the blacklist: " + e.getMessage());
        }
        try (FileInputStream fis = new FileInputStream(whitePath);
                InputStreamReader isr = new InputStreamReader(fis);
                BufferedReader br = new BufferedReader(isr)) {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0) {
                    continue;
                }
                whiteList.add(line);
            }
        } catch (Exception e) {
            System.out.println("Exception while reading the whitelist: " + e.getMessage());
        }
    }

    public List<String> getBlackList() {
        return blackList;
    }

    public void setBlackList(List<String> blackList) {
        this.blackList = blackList;
    }

    public List<String> getWhiteList() {
        return whiteList;
    }

    public void setWhiteList(List<String> whiteList) {
        this.whiteList = whiteList;
    }
}
Regex matching
package com.data;
/**
 *
 * @author yun
 * @date 2015-03-25
 * @time 11:02:01 AM
 * @todo Regex matching
 *
 */
public class Global {
    /**
     * A URL passes only if it matches no blacklist pattern and matches
     * at least one whitelist pattern.
     */
    public static boolean regex(String url, Regex regex) {
        for (String black : regex.getBlackList()) {
            if (!url.matches(black)) {
                continue;
            }
            return false;
        }
        for (String white : regex.getWhiteList()) {
            if (!url.matches(white)) {
                continue;
            }
            return true;
        }
        return false;
    }
}
Spider class
package com.data;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.Date;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.data.util.Hash;
import com.data.util.ZLib;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
/**
 *
 * @author yun
 * @date 2015-03-25
 * @time 10:54:49 AM
 * @todo Crawler
 *
 */
public class Spider {
    // Claim query/update: pick a link with status 0 and flip it to status 1 ("downloading").
    private BasicDBObject update = new BasicDBObject("$set", new BasicDBObject("status", 1));
    private BasicDBObject query = new BasicDBObject("status", 0);
    private MongoClient server;
    private Regex regex;

    public static void main(String[] args) {
        try {
            new Spider().execute();
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }

    public void execute() throws InterruptedException {
        init();
        Thread[] threads = new Thread[3];
        for (int x = 0; x < threads.length; x++) {
            threads[x] = new Thread(new Crawl());
            threads[x].start();
        }
        for (int x = 0; x < threads.length; x++) {
            threads[x].join();
        }
        server.close();
        System.out.println("end");
    }

    private void init() {
        try {
            server = new MongoClient("192.168.16.215");
        } catch (UnknownHostException e) {
            System.out.println(e.getMessage());
            return;
        }
        loadConfig();
    }

    public synchronized void loadConfig() {
        String blackPath = "D:/360pan/eclipse/workspace/Spider/bin/black";
        String whitePath = "D:/360pan/eclipse/workspace/Spider/bin/white";
        regex = new Regex(blackPath, whitePath);
    }

    // Extract all links from the page and queue the ones that pass the black/white list filter.
    private void analysisUrls(Document doc) {
        Elements select = doc.select("a[href]");
        for (Element link : select) {
            String url = link.absUrl("href");
            if (!Global.regex(url, regex)) {
                continue;
            }
            saveUrl(url);
        }
    }

    // Insert a new link with status 0; duplicates are rejected by the unique index on "url".
    private void saveUrl(String url) {
        if (url.contains("#")) {
            url = url.substring(0, url.indexOf("#"));
        }
        DBCollection collection = server.getDB("db").getCollection("url");
        BasicDBObject doc = new BasicDBObject();
        doc.append("url", url);
        doc.append("md5", Hash.getMd5String(url.getBytes()));
        doc.append("status", 0);
        doc.append("date", new Date());
        try {
            collection.insert(doc);
        } catch (Exception e) {
            return;
        }
    }

    class Crawl implements Runnable {
        @Override
        public void run() {
            DBCollection collection = server.getDB("db").getCollection("url");
            while (true) {
                // Atomically claim one link with status 0 and mark it as downloading.
                DBObject find = collection.findAndModify(query, update);
                if (find == null) {
                    break;
                }
                String url = find.get("url").toString();
                Connection connect = Jsoup.connect(url).timeout(3000).followRedirects(true);
                Document doc = null;
                try {
                    doc = connect.get();
                } catch (IOException e) {
                    // Download failed: the link stays at status 1 and is skipped.
                    System.out.println("crawl >> " + url + " >> " + e.getMessage());
                    continue;
                }
                System.out.println("crawl >> " + url);
                commitUrl(url);
                analysisUrls(doc);
                commitContent(doc.html(), url);
            }
        }
    }

    // Mark a link as "downloaded" (status 2).
    private void commitUrl(String url) {
        DBCollection collection = server.getDB("db").getCollection("url");
        BasicDBObject query = new BasicDBObject();
        query.put("url", url);
        BasicDBObject update = new BasicDBObject();
        BasicDBObject modify = new BasicDBObject();
        modify.put("status", 2);
        update.put("$set", modify);
        collection.update(query, update, true, true);
    }

    // Store the downloaded page, zlib-compressed, in the content collection.
    private void commitContent(String content, String url) {
        try {
            DBCollection collection = server.getDB("db").getCollection("content");
            BasicDBObject doc = new BasicDBObject();
            doc.append("url", url);
            doc.append("data", ZLib.compress(content.getBytes("UTF-8")));
            doc.append("md5", Hash.getMd5String(url.getBytes()));
            doc.append("date", new Date());
            collection.insert(doc);
        } catch (Exception e) {
            return;
        }
    }
}
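The com.data.util.Hash and com.data.util.ZLib helpers used above are not included in the post. Judging only from their call sites, Hash.getMd5String(byte[]) returns an MD5 string (presumably hex) and ZLib.compress(byte[]) returns zlib-compressed bytes, so they can be sketched with the JDK alone. The implementations below are assumptions that merely fit those call sites, each class in its own file.
// Hash.java - hex MD5 digest of a byte array (sketch).
package com.data.util;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class Hash {
    public static String getMd5String(byte[] data) {
        try {
            byte[] digest = MessageDigest.getInstance("MD5").digest(data);
            StringBuilder sb = new StringBuilder();
            for (byte b : digest) {
                sb.append(String.format("%02x", b));
            }
            return sb.toString();
        } catch (NoSuchAlgorithmException e) {
            // MD5 is always available in practice.
            throw new RuntimeException(e);
        }
    }
}

// ZLib.java - zlib-compress a byte array with java.util.zip.Deflater (sketch).
package com.data.util;

import java.io.ByteArrayOutputStream;
import java.util.zip.Deflater;

public class ZLib {
    public static byte[] compress(byte[] data) {
        Deflater deflater = new Deflater();
        deflater.setInput(data);
        deflater.finish();
        ByteArrayOutputStream out = new ByteArrayOutputStream(data.length);
        byte[] buffer = new byte[4096];
        while (!deflater.finished()) {
            out.write(buffer, 0, deflater.deflate(buffer));
        }
        deflater.end();
        return out.toByteArray();
    }
}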
Blacklist
.*#
mailto.*
.*.pdf
Whitelist
http://cjrb.cjn.cn/.*
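With these two lists, only pages under http://cjrb.cjn.cn/ that are not in-page anchors, mailto links, or PDFs get queued. The filter can be sanity-checked without the list files by filling a Regex instance through its setters, as in the sketch below; the FilterTest class name and the PDF/example.com sample URLs are made up for illustration. Keep in mind that String.matches requires the pattern to match the entire URL.
package com.data;

import java.util.Arrays;

// Quick check of the black/white list filter without reading the list files.
public class FilterTest {
    public static void main(String[] args) {
        // Dummy paths: the constructor just logs that the files are missing,
        // and the lists are filled in through the setters instead.
        Regex regex = new Regex("black", "white");
        regex.setBlackList(Arrays.asList(".*#", "mailto.*", ".*.pdf"));
        regex.setWhiteList(Arrays.asList("http://cjrb.cjn.cn/.*"));

        // true: whitelisted and not blacklisted
        System.out.println(Global.regex("http://cjrb.cjn.cn/html/2015-03/25/node_2.htm", regex));
        // false: ends in .pdf, so the ".*.pdf" blacklist pattern rejects it
        System.out.println(Global.regex("http://cjrb.cjn.cn/page/2015-03/25/layout.pdf", regex));
        // false: matches no whitelist pattern
        System.out.println(Global.regex("http://www.example.com/index.html", regex));
    }
}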
Initializing the collections and the entry link
package com.data;
import java.net.UnknownHostException;
import java.util.Date;
import com.data.util.Hash;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.MongoClient;
public class Init {
    public static void initUrlCollection(MongoClient server) {
        DBCollection collection = server.getDB("db").getCollection("url");
        // Unique index on "url" so duplicate links are rejected on insert.
        BasicDBObject url_ = new BasicDBObject();
        url_.put("url", 1);
        collection.ensureIndex(url_, "url_", true);
        // Index on "status" to speed up claiming links with status 0.
        BasicDBObject status_ = new BasicDBObject();
        status_.put("status", 1);
        collection.ensureIndex(status_, "status_");
    }

    public static void initContentCollection(MongoClient server) {
        DBCollection collection = server.getDB("db").getCollection("content");
        BasicDBObject url_ = new BasicDBObject();
        url_.put("url", 1);
        collection.ensureIndex(url_, "url_", true);
    }

    public static void initEntry(MongoClient server) {
        // Changjiang Daily (长江日报)
        String url = "http://cjrb.cjn.cn/html/2015-03/25/node_2.htm";
        DBCollection collection = server.getDB("db").getCollection("url");
        BasicDBObject entry = new BasicDBObject();
        entry.put("url", url);
        entry.put("status", 0);
        entry.put("md5", Hash.getMd5String(url.getBytes()));
        entry.put("date", new Date());
        collection.insert(entry);
    }

    public static void main(String[] args) throws UnknownHostException {
        MongoClient server = new MongoClient("192.168.16.215");
        initUrlCollection(server);
        initContentCollection(server);
        initEntry(server);
    }
}
After initialization, run the Spider class.
The console output during a run looks like this:
[screenshot: crawler console output]
At first the crawler also tried to fetch PDF files; after the PDF pattern was added to the blacklist, PDFs are no longer crawled.
The resulting data in the database looks like this:
[screenshot: database contents]
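The outcome can also be checked programmatically instead of through a GUI client; the sketch below just counts url documents by status and the number of stored pages, using the same driver calls and host as above (the Check class name is made up for illustration).
package com.data;

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.MongoClient;

// Counts links by status and the number of stored pages.
public class Check {
    public static void main(String[] args) throws Exception {
        MongoClient server = new MongoClient("192.168.16.215");
        DBCollection url = server.getDB("db").getCollection("url");
        DBCollection content = server.getDB("db").getCollection("content");
        System.out.println("initial:     " + url.count(new BasicDBObject("status", 0)));
        System.out.println("downloading: " + url.count(new BasicDBObject("status", 1)));
        System.out.println("downloaded:  " + url.count(new BasicDBObject("status", 2)));
        System.out.println("pages saved: " + content.count());
        server.close();
    }
}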
Original article: http://my.oschina.net/u/1240328/blog/391699