首页 > 编程语言 > 详细

Java正则表达式获取网页所有网址和链接文字

时间:2014-03-21 12:46:58      阅读:492      评论:0      收藏:0      [点我收藏+]

/*获取网址首页的所有网址和链接文字*/
bubuko.com,布布扣
bubuko.com,布布扣
bubuko.com,布布扣import java.io.BufferedReader;
bubuko.com,布布扣import java.io.IOException;
bubuko.com,布布扣import java.io.InputStreamReader;
bubuko.com,布布扣import java.net.MalformedURLException;
bubuko.com,布布扣import java.net.URL;
bubuko.com,布布扣import java.util.ArrayList;
bubuko.com,布布扣import java.util.HashMap;
bubuko.com,布布扣import java.util.List;
bubuko.com,布布扣import java.util.regex.Matcher;
bubuko.com,布布扣import java.util.regex.Pattern;
bubuko.com,布布扣
bubuko.com,布布扣
bubuko.com,布布扣
bubuko.com,布布扣import java.net.*;
bubuko.com,布布扣import java.io.*;
bubuko.com,布布扣import java.util.regex.*;
bubuko.com,布布扣
/*
根据指定的规则,通过构造正则表达式获取网址
*/

bubuko.com,布布扣
bubuko.com,布布扣
/**
 * Collects the hyperlinks (URL and link text) found on a single web page.
 *
 * <p>Typical usage, as shown in {@link #main(String[])}: construct the object with the
 * two marker strings that delimit the region of interest, set the page address with
 * {@link #getStartUrl(String)}, download it with {@link #getUrlContent()}, cut out the
 * region with {@link #getContentArea()}, optionally set keyword filters, and finally
 * call {@link #Urls()} to print and collect the links.
 *
 * <p>Note that the {@code getXxx(String)} methods are setters despite their names; the
 * names are kept for compatibility with existing callers.
 */
public class Urls
{
    /**
     * Matches one complete anchor element. Group 1 is the attribute text of the opening
     * tag, group 2 is everything between the opening and the closing tag. The word
     * boundary after {@code <a} keeps tags such as {@code <abbr>} from matching.
     */
    private static final Pattern ANCHOR_PATTERN = Pattern.compile(
            "<a\\b([^>]*)>(.*?)</a\\s*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Matches an {@code href} attribute inside an opening tag's attribute text. Exactly
     * one of groups 1-3 is set: double-quoted, single-quoted or unquoted value.
     */
    private static final Pattern HREF_PATTERN = Pattern.compile(
            "(?:^|\\s)href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
            Pattern.CASE_INSENSITIVE);

    /** Matches any markup tag; used to reduce link content to plain text. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /** Address of the page to collect links from. */
    private String startUrl;

    /** Full text of the downloaded page; stays {@code null} until a download succeeds. */
    String urlContent;

    /** The part of {@link #urlContent} between the begin and end markers. */
    String ContentArea;

    /** Marker strings that open and close the region to be scanned. */
    private String strAreaBegin, strAreaEnd;

    /** Keyword a URL must contain / must not contain; {@code null} disables the filter. */
    private String stringInUrl, stringNotInUrl;

    /** Collected content; not used by this class yet. */
    String strContent;

    /** URLs collected by the most recent call to {@link #Urls()}. */
    String[] allUrls;

    /** The collection rule: the regular expression used to find anchor elements. */
    private String regex = ANCHOR_PATTERN.pattern();

    /** Holder for one URL/title pair; not used by this class yet. */
    UrlAndTitle urlAndTitle = new UrlAndTitle();

    /**
     * Demo entry point: downloads a fixed page and prints every link found in its body,
     * skipping links whose URL contains "google".
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        Urls myurl = new Urls("<body", "/body>");
        myurl.getStartUrl("http://www.zuzwn.com/");
        myurl.getUrlContent();
        myurl.getContentArea();
        myurl.getStringNotInUrl("google");
        myurl.Urls();
    }

    /**
     * Creates a collector that scans the region between two marker strings.
     *
     * @param strAreaBegin text that marks the start of the region; scanning starts
     *                     right after its first occurrence
     * @param strAreaEnd   text that marks the end of the region
     */
    public Urls(String strAreaBegin, String strAreaEnd)
    {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }

    /**
     * Finds every anchor element in {@link #ContentArea}, prints the element, its plain
     * text title and its URL to standard output, then prints the number of links
     * reported. The reported URLs are also stored in {@link #allUrls}.
     *
     * <p>Links rejected by the keyword filters are neither printed nor counted. A
     * {@code null} content area is treated as empty.
     *
     * <p>This is an ordinary method that happens to share the class name; it is not a
     * constructor.
     */
    public void Urls()
    {
        String area = (ContentArea == null) ? "" : ContentArea;
        List<String> collected = new ArrayList<String>();
        int count = 0;

        Matcher anchors = ANCHOR_PATTERN.matcher(area);
        while (anchors.find())
        {
            String url = extractHref(anchors.group(1));
            if (!passesFilters(url))
            {
                continue;
            }
            count++;

            System.out.println(anchors.group());
            // Strip nested markup so that e.g. <b>Text</b> is reported as "Text".
            String title = TAG_PATTERN.matcher(anchors.group(2)).replaceAll("").trim();
            System.out.println("标题:" + title);
            if (url != null)
            {
                System.out.println("网址:" + url);
                collected.add(url);
            }
            System.out.println();
        }

        allUrls = collected.toArray(new String[collected.size()]);
        System.out.println("共有" + count + "个符合结果");
    }

    /** Returns the value of the href attribute in the given attribute text, or null. */
    private static String extractHref(String attributes)
    {
        Matcher href = HREF_PATTERN.matcher(attributes);
        if (!href.find())
        {
            return null;
        }
        for (int group = 1; group <= 3; group++)
        {
            if (href.group(group) != null)
            {
                return href.group(group);
            }
        }
        return null;
    }

    /** Tells whether a URL (possibly null) satisfies the include and exclude keywords. */
    private boolean passesFilters(String url)
    {
        String candidate = (url == null) ? "" : url;
        if (stringInUrl != null && stringInUrl.length() > 0
                && candidate.indexOf(stringInUrl) < 0)
        {
            return false;
        }
        if (stringNotInUrl != null && stringNotInUrl.length() > 0
                && candidate.indexOf(stringNotInUrl) >= 0)
        {
            return false;
        }
        return true;
    }

    /**
     * Sets the address of the page to collect links from.
     *
     * @param startUrl absolute URL of the page
     */
    public void getStartUrl(String startUrl)
    {
        this.startUrl = startUrl;
    }

    /**
     * Downloads the page at the start URL into {@link #urlContent}.
     *
     * <p>Line breaks are preserved as {@code '\n'} so that tokens on adjacent lines are
     * not glued together. On failure a message and the stack trace are printed and
     * {@link #urlContent} keeps its previous value.
     */
    public void getUrlContent()
    {
        BufferedReader reader = null;
        try
        {
            URL page = new URL(startUrl);
            // NOTE(review): the stream is decoded with the platform default charset,
            // as before; pages in another encoding may come out garbled.
            reader = new BufferedReader(new InputStreamReader(page.openStream()));

            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
            {
                text.append(line).append('\n');
            }
            urlContent = text.toString();
        }
        catch (IOException e)
        {
            System.out.println("网址文件未能输出");
            e.printStackTrace();
        }
        finally
        {
            if (reader != null)
            {
                try
                {
                    reader.close();
                }
                catch (IOException ignored)
                {
                    // Nothing useful can be done if closing fails after reading.
                }
            }
        }
    }

    /**
     * Cuts the region between the begin and end markers out of {@link #urlContent} and
     * stores it in {@link #ContentArea}.
     *
     * <p>The region starts right after the first occurrence of the begin marker and
     * ends right before the next occurrence of the end marker. If the begin marker is
     * absent the region starts at the beginning of the page; if the end marker is
     * absent it runs to the end of the page. If nothing has been downloaded the region
     * is the empty string.
     */
    public void getContentArea()
    {
        if (urlContent == null)
        {
            ContentArea = "";
            return;
        }

        int start = 0;
        if (strAreaBegin != null)
        {
            int beginAt = urlContent.indexOf(strAreaBegin);
            if (beginAt >= 0)
            {
                start = beginAt + strAreaBegin.length();
            }
        }

        int end = (strAreaEnd == null) ? -1 : urlContent.indexOf(strAreaEnd, start);
        if (end < 0)
        {
            end = urlContent.length();
        }

        ContentArea = urlContent.substring(start, end);
    }

    /**
     * Sets the keyword every reported URL must contain. Only a single keyword is
     * supported for now.
     *
     * @param stringInUrl required keyword, or {@code null} for no restriction
     */
    public void getStringInUrl(String stringInUrl)
    {
        this.stringInUrl = stringInUrl;
    }

    /**
     * Sets the keyword no reported URL may contain. Only a single keyword is supported
     * for now.
     *
     * @param stringNotInUrl forbidden keyword, or {@code null} for no restriction
     */
    public void getStringNotInUrl(String stringNotInUrl)
    {
        this.stringNotInUrl = stringNotInUrl;
    }

    /** Placeholder for retrieving a single URL; currently does nothing. */
    public void getUrl()
    {
    }

    /**
     * Returns the collection rule.
     *
     * @return the regular expression used to find anchor elements
     */
    public String getRegex()
    {
        return regex;
    }

    /** Simple holder for one link: its URL and its title text. */
    class UrlAndTitle
    {
        String myURL;
        String title;
    }
}

Java正则表达式获取网页所有网址和链接文字,布布扣,bubuko.com

Java正则表达式获取网页所有网址和链接文字

原文:http://www.cnblogs.com/zuzwn/p/3614978.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!