C# 用正则表达式去掉 HTML 中的 script 脚本和 HTML 标签

时间：2014-02-17 07:33:12 阅读：440 评论：0 收藏：0 [点我收藏+]

/// <summary>
/// Removes script blocks and HTML tags from a string using regular expressions,
/// decodes common HTML entities, and returns the trimmed plain text.
/// </summary>
/// <remarks>
/// This is a best-effort text extractor, not a security sanitizer: entities that
/// survive until the final HtmlDecode call (for example hexadecimal character
/// references) are decoded after the angle brackets have been stripped.
/// </remarks>
/// <param name="Htmlstring">The HTML fragment to clean. May be null or empty.</param>
/// <returns>
/// The input with scripts, tags, comment markers and line breaks removed and
/// entities decoded; an empty string when the input is null or empty.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    // Regex.Replace throws on null input; treat "nothing to clean" as empty text.
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove script blocks. Singleline lets '.' match line breaks so that scripts
    // spanning several lines are removed together with their body.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove HTML tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove a line break together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove leftover comment terminators, then unterminated comment openers
    // (from the opener up to the end of the line).
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the most common named / decimal entities.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    // A non-breaking space entity becomes three ordinary spaces.
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "   ", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);

    // Drop every other decimal character reference.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // string.Replace returns a new string; the result must be assigned back,
    // otherwise these statements have no effect.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    // Decode whatever entities remain and remove any <br> tags that decoding produced.
    Htmlstring = HttpUtility.HtmlDecode(Htmlstring)
        .Replace("<br/>", "")
        .Replace("<br>", "")
        .Trim();

    return Htmlstring;
}

原文：http://www.cnblogs.com/lonelyxmas/p/3551624.html

踩

(0)

评论一句话评论（0）

分享档案

更多>