首页 > Web开发 > 详细

正则抓取网页所有href和src

时间:2014-07-01 19:04:03      阅读:413      评论:0      收藏:0      [点我收藏+]

先抓取目标页面的内容,再用正则表达式匹配页面中所有的 href 和 src 属性。

bubuko.com,布布扣
// User-Agent header value sent with each outgoing request made by getVerificationCode.
string UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:29.0) Gecko/20100101 Firefox/29.0";
    // Content-Type header of the most recent successful download; written by getVerificationCode.
    string ContentType = "";

    // Base address that the incoming request's raw URL is resolved against in Application_BeginRequest.
    Uri strReqUrl = new Uri("http://m.lhrb.ufstone.net/");
    /// <summary>
    /// Fetches the page that corresponds to the incoming request from the upstream
    /// site (<c>strReqUrl</c>) and writes its HTML to the current response.
    /// </summary>
    /// <param name="sender">Event source; not used.</param>
    /// <param name="e">Event data; not used.</param>
    protected void Application_BeginRequest(object sender, EventArgs e)
    {
        // Resolve the incoming raw URL against the upstream base address.
        Uri target = new Uri(strReqUrl, Request.RawUrl);
        byte[] payload = getVerificationCode(target);

        // getVerificationCode returns null when the download fails. Passing null to
        // Encoding.GetString throws ArgumentNullException, so end the request with
        // an explicit gateway error instead.
        if (payload == null)
        {
            Response.StatusCode = 502; // Bad Gateway: the upstream fetch failed
            Response.End();
            return;
        }

        // NOTE(review): the body is always decoded as gb2312 and the upstream
        // Content-Type (captured in ContentType) is not forwarded, so non-text or
        // differently encoded resources will come out garbled - confirm intent.
        StringBuilder strHtml = new StringBuilder(Encoding.GetEncoding("gb2312").GetString(payload));
        GetHtmlUrl(ref strHtml);
        Response.Write(strHtml.ToString());
        Response.End();
    }
    /// <summary>
    /// Downloads the raw bytes at <paramref name="url"/> using browser-like request
    /// headers and records the response's Content-Type header in <c>ContentType</c>.
    /// </summary>
    /// <param name="url">Address to download; its <c>AbsoluteUri</c> is requested.</param>
    /// <returns>
    /// The response body, or <c>null</c> when the request fails with a
    /// <see cref="WebException"/>.
    /// </returns>
    public byte[] getVerificationCode(Uri url)
    {
        // WebClient is IDisposable; the using block releases it on every path.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            client.Headers.Add("Accept-Language", "    zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            client.Headers.Add("User-Agent", this.UserAgent);
            client.Credentials = CredentialCache.DefaultCredentials;
            try
            {
                byte[] pageData = client.DownloadData(url.AbsoluteUri);
                // ResponseHeaders is only populated after the download has completed.
                ContentType = client.ResponseHeaders["Content-Type"];
                return pageData;
            }
            catch (WebException)
            {
                // Network or HTTP-level failure: keep the "null means failed" contract
                // for callers, but stop swallowing unrelated exception types.
                return null;
            }
        }
    }
View Code

 

    /// <summary>
    /// Scans the HTML in <paramref name="strHtml"/> for <c>src</c> and <c>href</c>
    /// attributes and collects each full match, one per line.
    /// </summary>
    /// <param name="strHtml">HTML to scan. Its content is read but not modified.</param>
    /// <remarks>
    /// NOTE(review): the collected text is neither returned nor written back into
    /// <paramref name="strHtml"/>, so callers currently observe no effect - confirm
    /// whether the URL rewriting this method was heading towards is still wanted.
    /// </remarks>
    void GetHtmlUrl(ref StringBuilder strHtml)
    {
        // Differences from the previous pattern:
        //  - named groups "attr"/"url": the old explicitly numbered (?<1>...) group
        //    shared group number 1 with the unnamed (src|href) group;
        //  - the lookbehind stops matches inside longer names such as data-src;
        //  - single-quoted values are recognised, and unquoted values end at
        //    whitespace or '>' instead of running on into the following markup.
        string pattern = "(?<![\\w-])(?<attr>src|href)\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^\\s>]+))";

        // HTML attribute names are case-insensitive, so HREF= and Src= must match too.
        Regex attributeRegex = new Regex(pattern, RegexOptions.IgnoreCase);

        StringBuilder collected = new StringBuilder();
        foreach (Match match in attributeRegex.Matches(strHtml.ToString()))
        {
            // Whole match, e.g. href="/page"; the bare value is match.Groups["url"].Value.
            collected.Append(match.Value + "\n");
        }
    }

 

正则抓取网页所有href和src,布布扣,bubuko.com

正则抓取网页所有href和src

原文:http://www.cnblogs.com/xuxiaoshuan/p/3817662.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!