Java jsoup 网络爬虫 学习例子(五) 宽度优先
// ===== File: GetHtml.java =====
package com.iteye.injavawetrust.gethtml;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Crawler entry point. Starting from a seed URL it repeatedly takes the next
 * URL from the pending queue, downloads the page, stores it on disk and
 * enqueues the links found on that page.
 *
 * @author InJavaWeTrust
 */
public class GetHtml {

    /** The crawl stops once this many pages have been downloaded successfully. */
    private static final int MAX_VISITED = 10000;

    private static final JsoupUtil ju = JsoupUtil.getInstance();

    /**
     * Crawls the site starting at {@code url}.
     *
     * <p>The loop ends when the pending queue is empty or when
     * {@link HtmlQueue#getVisitedUrlNum()} reaches {@value #MAX_VISITED}.
     * Only links that start with {@link Constants#URL} and pass
     * {@link JsoupUtil#checkURL(String)} are enqueued.
     *
     * @param url seed URL the crawl starts from
     */
    public void getHtml(String url) {
        ju.initUnvisitedUrl(url);

        // URLs whose download already failed during this crawl. They are never
        // added to the visited set, so without this record every later page
        // linking to them would put them back on the queue and trigger the
        // same failing request again.
        Set<String> failedUrls = new HashSet<String>();

        // Pending queue not empty and fewer than MAX_VISITED pages downloaded.
        while (!HtmlQueue.unVisitedUrlsEmpty()
                && HtmlQueue.getVisitedUrlNum() < MAX_VISITED) {
            String visitUrl = (String) HtmlQueue.unVisitedUrlDeQueue();
            if (null == visitUrl) {
                continue;
            }

            Map<String, String> page = ju.getHtml(visitUrl);
            if (page.isEmpty()) {
                // An empty map is how JsoupUtil.getHtml reports a failed download.
                failedUrls.add(visitUrl);
                continue;
            }

            ju.getHtmlToLocal(page);            // write the HTML to a local file
            HtmlQueue.addVisitedUrl(visitUrl);  // remember the URL as visited

            // Extract the URLs contained in the downloaded page.
            for (String link : ju.getAllUrl(visitUrl)) {
                if (!link.startsWith(Constants.URL)) {
                    continue;
                }
                if (!ju.checkURL(link)) {
                    continue;
                }
                if (failedUrls.contains(link)) {
                    continue;
                }
                // Enqueue the new, not yet visited URL.
                HtmlQueue.addUnvisitedUrl(link);
            }
        }
    }

    /**
     * Runs the crawl from {@link Constants#URL} and prints the number of
     * downloaded pages and the elapsed time.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        GetHtml gh = new GetHtml();
        long startTime = System.currentTimeMillis();
        gh.getHtml(Constants.URL);
        long endTime = System.currentTimeMillis();
        System.out.println("共下载 [" + HtmlQueue.getVisitedUrlNum() + "]");
        System.out.println("用时 [" + ju.msToss(endTime - startTime) + "]");
    }
}

// ===== File: HtmlQueue.java =====
package com.iteye.injavawetrust.gethtml;

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

/**
 * Static bookkeeping for the crawl: the set of visited URLs and the queue of
 * URLs that are still pending.
 *
 * @author InJavaWeTrust
 */
public class HtmlQueue {

    /** URLs that have already been visited. */
    private static final Set<String> visitedUrl = new HashSet<String>();

    /**
     * URLs waiting to be visited, in insertion (FIFO) order. A FIFO queue is
     * what makes the traversal breadth-first; a PriorityQueue would hand out
     * URLs in lexicographic order instead.
     */
    private static final Queue<String> unVisitedUrl = new ArrayDeque<String>();

    /**
     * Returns the pending-URL queue itself (not a copy).
     *
     * @return the queue of URLs waiting to be visited
     */
    public static Queue<String> getUnVisitedUrl() {
        return unVisitedUrl;
    }

    /**
     * Records a URL as visited.
     *
     * @param url URL to add to the visited set
     */
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    /**
     * Removes a URL from the visited set.
     *
     * @param url URL to remove
     */
    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    /**
     * Removes and returns the head of the pending queue.
     *
     * @return the next pending URL, or {@code null} if the queue is empty
     */
    public static Object unVisitedUrlDeQueue() {
        return unVisitedUrl.poll();
    }

    /**
     * Appends a URL to the pending queue so that each URL is visited at most
     * once: {@code null}, blank, already visited and already queued URLs are
     * ignored.
     *
     * @param url URL to enqueue
     */
    public static void addUnvisitedUrl(String url) {
        if (url == null || url.trim().equals("")) {
            return;
        }
        if (visitedUrl.contains(url) || unVisitedUrl.contains(url)) {
            return;
        }
        unVisitedUrl.add(url);
    }

    /**
     * Returns the number of visited URLs.
     *
     * @return size of the visited set
     */
    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }

    /**
     * Tells whether the pending queue is empty.
     *
     * @return {@code true} if no URL is pending, {@code false} otherwise
     */
    public static boolean unVisitedUrlsEmpty() {
        return unVisitedUrl.isEmpty();
    }
}

// ===== File: Constants.java =====
package com.iteye.injavawetrust.gethtml;

/**
 * Crawl configuration.
 *
 * @author InJavaWeTrust
 */
public class Constants {

    /** Seed URL; also the prefix a link must have to be followed. */
    public static final String URL = "http://www.jqu.net.cn";

    /** Directory (with trailing separator) downloaded pages are written to. */
    public static final String HTMLPATH = "E:\\InJavaWeTrust\\jsoup\\html\\";
}

// ===== File: JsoupUtil.java =====
package com.iteye.injavawetrust.gethtml;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Singleton helper around jsoup: downloads pages, extracts links and writes
 * pages to disk.
 *
 * @author InJavaWeTrust
 */
public class JsoupUtil {

    /** Connection timeout in milliseconds used for every request. */
    private static final int TIMEOUT_MS = 5000;

    private static final JsoupUtil instance = new JsoupUtil();

    private JsoupUtil() {
    }

    /**
     * Returns the shared instance.
     *
     * @return the singleton {@code JsoupUtil}
     */
    public static JsoupUtil getInstance() {
        return instance;
    }

    /**
     * Seeds the pending-URL queue.
     *
     * @param url URL to enqueue
     */
    public void initUnvisitedUrl(String url) {
        HtmlQueue.addUnvisitedUrl(url);
    }

    /**
     * Downloads {@code url} and collects the raw attribute values of
     * {@code a[href]}, {@code img[src]}, {@code option[value]} and
     * {@code link[href]} elements.
     *
     * <p>This method performs its own HTTP request; it does not reuse a page
     * fetched earlier by {@link #getHtml(String)}. On an {@link IOException}
     * the stack trace is printed and the values collected so far (possibly
     * none) are returned.
     *
     * @param url URL of the page to scan
     * @return the collected attribute values, never {@code null}
     */
    public Set<String> getAllUrl(String url) {
        Set<String> urls = new HashSet<String>();
        try {
            Document document = Jsoup.connect(url).timeout(TIMEOUT_MS).get();
            collectAttribute(document, "a[href]", "href", urls);
            collectAttribute(document, "img[src]", "src", urls);
            collectAttribute(document, "option[value]", "value", urls);
            collectAttribute(document, "link[href]", "href", urls);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return urls;
    }

    /** Adds the {@code attribute} value of every element matching {@code cssQuery} to {@code sink}. */
    private static void collectAttribute(Document document, String cssQuery,
            String attribute, Set<String> sink) {
        Elements matches = document.select(cssQuery);
        for (Element match : matches) {
            sink.add(match.attr(attribute));
        }
    }

    /**
     * Downloads a page and returns its HTML together with a file name derived
     * from the URL.
     *
     * @param url URL of the page to download
     * @return a map with key {@code "html"} (page content) and key
     *         {@code "title"} (the URL with every {@code /} and {@code :}
     *         removed); an empty map if the download failed
     */
    public Map<String, String> getHtml(String url) {
        Map<String, String> map = new HashMap<String, String>();
        try {
            Document document = Jsoup.connect(url).timeout(TIMEOUT_MS).get();
            map.put("html", document.html());
            // Literal replacement; no regular expression is needed here.
            map.put("title", url.replace("/", "").replace(":", ""));
        } catch (IOException e) {
            System.out.println("This is html has exception [" + url + "]");
            System.out.println(e.getMessage());
        }
        return map;
    }

    /**
     * Tells whether the text after the last {@code '.'} of the URL is exactly
     * {@code html}.
     *
     * @param url URL to test; {@code null} yields {@code false}
     * @return {@code true} if the URL ends with the {@code html} extension,
     *         {@code false} otherwise
     */
    public boolean checkURL(String url) {
        if (url == null) {
            return false;
        }
        String extension = url.substring(url.lastIndexOf('.') + 1);
        return "html".equals(extension);
    }

    /**
     * Writes a downloaded page to a local file, UTF-8 encoded. The file is
     * {@link Constants#HTMLPATH} followed by the map's {@code "title"} value.
     * I/O failures are reported by printing the stack trace.
     *
     * @param map page data as produced by {@link #getHtml(String)}: key
     *            {@code "html"} holds the content, key {@code "title"} the
     *            file name
     */
    public void getHtmlToLocal(Map<String, String> map) {
        // Make sure the target directory exists; otherwise every write fails
        // with FileNotFoundException. A failed mkdirs() is not handled here
        // because it surfaces as an IOException when the file is opened below.
        new File(Constants.HTMLPATH).mkdirs();

        Writer writer = null;
        try {
            File target = new File(Constants.HTMLPATH + map.get("title"));
            writer = new OutputStreamWriter(new FileOutputStream(target), "UTF-8");
            writer.write(map.get("html"));
            writer.flush();
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Formats a duration in milliseconds as {@code hh:mm:ss}.
     *
     * <p>The hour field is not wrapped at 24, so durations of a day or more
     * are shown in full (for example {@code 25:00:00}). Negative input is
     * treated as zero.
     *
     * @param ms duration in milliseconds
     * @return the duration as {@code hh:mm:ss}, each field at least two digits
     */
    public String msToss(long ms) {
        long totalSeconds = Math.max(0L, ms) / 1000L;
        long hours = totalSeconds / 3600L;
        long minutes = (totalSeconds % 3600L) / 60L;
        long seconds = totalSeconds % 60L;
        return String.format("%02d:%02d:%02d", hours, minutes, seconds);
    }
}
运行结果:
This is html has exception [http://www.jqu.net.cn/node/1166/10483.html]
404 error loading URL http://www.jqu.net.cn/node/1166/10483.html
This is html has exception [http://www.jqu.net.cn/node/459/16310.html]
404 error loading URL http://www.jqu.net.cn/node/459/16310.html
This is html has exception [http://www.jqu.net.cn/node/459/16310.html]
404 error loading URL http://www.jqu.net.cn/node/459/16310.html
This is html has exception [http://www.jqu.net.cn/node/858/16309.html]
404 error loading URL http://www.jqu.net.cn/node/858/16309.html
共下载 [3537]
用时 [00:04:20]
原文:http://injavawetrust.iteye.com/blog/2280426