// Reference (Regex class documentation): https://msdn.microsoft.com/zh-cn/library/system.text.regularexpressions.regex(v=vs.110).aspx
/// <summary>
/// Form load handler that exercises the extraction patterns against hard-coded,
/// already-normalised HTML samples (no network access is involved).
/// </summary>
/// <remarks>
/// A table-row sample is passed to <c>ExtensionPost</c> and a paragraph sample to
/// <c>ExtensionPostb</c>; each result is shown in a message box. A six-paragraph sample
/// is then parsed with <c>GetRegValue</c> and the rows are bound to <c>dataGridView1</c>.
/// </remarks>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void StatsoneForm_Load(object sender, EventArgs e)
{
    // Sample 1: one table row in the normalised form the row pattern expects.
    string s = @"<tr height=‘19‘ style=‘height:14.25pt;mso-height-source:userset;mso-height-alt:285;‘> <td class=‘xl67‘ height=‘19‘ style=‘height:14.25pt;‘></td> <td class=‘xl71‘ x:num>110000</td> <td class=‘xl71‘ x:str>北京市</td> <td class=‘xl67‘></td> <td class=‘xl70‘></td> <td class=‘xl70‘></td> <td class=‘xl70‘></td> <td colspan=‘3‘ style=‘mso-ignore:colspan;‘></td> </tr>";
    string f = ExtensionPost(s);
    MessageBox.Show(f);

    // Sample 2: one paragraph in the normalised form the paragraph pattern expects.
    string sb = @"<p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>110000<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>北京市</span></p>";
    string fb = ExtensionPostb(sb);
    MessageBox.Show(fb);

    // Sample 3: six consecutive paragraphs, parsed into code/name pairs.
    string strhtml = @"<p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>110000<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>北京市</span></p><p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>110100<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>市辖区</span></p><p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>110101<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>东城区</span></p><p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>110102<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>西城区</span></p><p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>110105<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>朝阳区</span></p><p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>110106<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>丰台区</span></p>";
    IEnumerable<AreaHtmlValue> htmlValue = GetRegValue(@"<p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>(?<code>\d+)<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>(?<name>\w*)</span></p>", strhtml);

    List<AreaInfo> areaList = (from v in htmlValue
                               select new AreaInfo
                               {
                                   // The sample codes are six digits long. Keeping only five
                                   // characters mapped 110101, 110102, 110105 and 110106 to the
                                   // same value, so keep six; shorter captures are left as-is
                                   // instead of letting Substring throw.
                                   AreaCode = v.Code.Length > 6 ? v.Code.Substring(0, 6) : v.Code,
                                   AreaName = v.Name,
                                   AreaFullName = v.Name,
                                   ParentAreaCode = "0",
                                   ParentId = 0,
                                   CreateTime = DateTime.Now,
                                   AreaYear = 2015
                               }).ToList();
    this.dataGridView1.DataSource = areaList;
}
/// <summary>
/// Pattern for one normalised table row: a numeric code cell (group <c>port</c>)
/// followed by a name cell (group <c>proto</c>). Built once because compiling a
/// regular expression is expensive.
/// </summary>
private static readonly Regex TableRowRegex = new Regex(@"<tr height=‘19‘ style=‘height:14.25pt;mso-height-source:userset;mso-height-alt:285;‘> <td class=‘xl67‘ height=‘19‘ style=‘height:14.25pt;‘></td> <td class=‘xl71‘ x:num>(?<port>\d+)</td> <td class=‘xl71‘ x:str>(?<proto>\w+)</td> <td class=‘xl67‘></td> <td class=‘xl70‘></td> <td class=‘xl70‘></td> <td class=‘xl70‘></td> <td colspan=‘3‘ style=‘mso-ignore:colspan;‘></td> </tr>",
    RegexOptions.Compiled);

/// <summary>
/// Extracts the area name and numeric code from a single normalised table row and
/// returns them concatenated as name followed by code (for example "北京市110000").
/// </summary>
/// <remarks>
/// The pattern is literal: the row must have exactly the attributes and the single
/// spaces between cells that the pattern spells out. Rows whose name cell wraps the
/// text in extra span/font elements are not matched by this pattern.
/// </remarks>
/// <param name="url">Normalised HTML of one table row (despite the name, not a URL).</param>
/// <returns>
/// The name followed by the code, or an empty string when the input is null, empty
/// or does not match the pattern.
/// </returns>
static String ExtensionPost(String url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }

    Match match = TableRowRegex.Match(url);

    // Match.Result throws NotSupportedException on a failed match, so test Success first.
    return match.Success ? match.Result("${proto}${port}") : string.Empty;
}
/// <summary>
/// Pattern for one normalised paragraph: a numeric code (group <c>port</c>) in the
/// first span and a name (group <c>proto</c>) in the second. Built once because
/// compiling a regular expression is expensive.
/// </summary>
private static readonly Regex ParagraphRegex = new Regex(@"<p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>(?<port>\d+)<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>(?<proto>\w*)</span></p>",
    RegexOptions.Compiled);

/// <summary>
/// Extracts the area name and numeric code from a single normalised paragraph and
/// returns them concatenated as name followed by code (for example "北京市110000").
/// </summary>
/// <param name="url">Normalised HTML of one paragraph (despite the name, not a URL).</param>
/// <returns>
/// The name followed by the code, or an empty string when the input is null, empty
/// or does not match the pattern.
/// </returns>
static string ExtensionPostb(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }

    Match match = ParagraphRegex.Match(url);

    // Match.Result throws NotSupportedException on a failed match, so test Success first.
    return match.Success ? match.Result("${proto}${port}") : string.Empty;
}
/// <summary>
/// Downloads the stats.gov.cn administrative-division page, extracts code/name pairs
/// from its paragraphs into <c>dataGridView2</c>, and shows the cleaned page source in
/// <c>richTextBox2</c>.
/// </summary>
/// <remarks>
/// The paragraph pattern is written against the output of <c>GetHtml</c>, i.e. the
/// page after <c>ReplaceEnter</c>, <c>RemoveStyle</c> and <c>RemoveScript</c> have run.
/// An alternative source, http://files2.mca.gov.cn/www/201512/20151224151630189.htm,
/// publishes the same kind of data as table rows and would need a row pattern instead.
/// </remarks>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    string url = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html";
    try
    {
        // Extract (code, name) pairs from the normalised page.
        IEnumerable<AreaHtmlValue> htmlValue = GetRegValue(@"<p class=‘msonormal‘ style=‘line-height: 150%‘><span lang=‘en-us‘ style=‘line-height: 150%; font-family: ‘times new roman‘, ‘serif‘; font-size: 12pt‘>(?<code>\d+)<span> </span></span><span style=‘line-height: 150%; font-family: 宋体; font-size: 12pt‘>(?<name>\w*)</span></p>", GetHtml(url));

        List<AreaInfo> areaList = (from v in htmlValue
                                   select new AreaInfo
                                   {
                                       // Keep at most six characters; a shorter capture is kept
                                       // whole instead of letting Substring throw.
                                       AreaCode = v.Code.Length > 6 ? v.Code.Substring(0, 6) : v.Code,
                                       AreaName = v.Name,
                                       AreaFullName = v.Name,
                                       ParentAreaCode = "0",
                                       ParentId = 0,
                                       CreateTime = DateTime.Now,
                                       AreaYear = 2015
                                   }).ToList();
        this.dataGridView2.DataSource = areaList;

        // Second download of the same page, shown cleaned in the rich text box.
        // WebClient is disposable, so release it deterministically.
        using (WebClient wc = new WebClient())
        {
            string mainData = Encoding.UTF8.GetString(wc.DownloadData(url));
            this.richTextBox2.Text = RemoveScript(RemoveStyle(ReplaceEnter(mainData)));
        }
    }
    catch (Exception ex)
    {
        // Previously the exception was evaluated and discarded, so failures were
        // invisible. Surface them to the user instead.
        MessageBox.Show(ex.Message);
    }
}
#region 网页源码
/// <summary>
/// Downloads the page at <paramref name="url"/>, finds its img elements, and saves
/// each referenced image under <c>c:\g\</c>, printing every image address to the console.
/// </summary>
/// <remarks>
/// After the loop the method waits for a key press via <c>Console.ReadKey</c>.
/// </remarks>
/// <param name="url">Absolute address of the page whose images are downloaded.</param>
private static void updowndimg(string url)
{
    const string targetDirectory = @"c:\g\";

    // WebClient is disposable; the original never released it.
    using (WebClient client = new WebClient())
    {
        string html = client.DownloadString(url);
        Uri pageUri = new Uri(url);

        // DownloadFile fails if the target directory does not exist.
        Directory.CreateDirectory(targetDirectory);

        // Non-greedy, per-tag pattern. The previous pattern used a greedy ".*" before
        // "src", so several images on one line produced a single match holding only
        // the last src.
        MatchCollection matches = Regex.Matches(html, @"<img\b[^>]*?\bsrc=""([^""]+)""[^>]*>", RegexOptions.IgnoreCase);
        foreach (Match match in matches)
        {
            // Resolve the src against the page address. The previous code prefixed the
            // literal text "url/", which never produced a downloadable address.
            Uri imageUri = new Uri(pageUri, match.Groups[1].Value);

            // LocalPath excludes any query string, which keeps the file name valid.
            client.DownloadFile(imageUri, targetDirectory + Path.GetFileName(imageUri.LocalPath));
            Console.WriteLine(imageUri);
        }
    }
    Console.ReadKey();
}
/// <summary>
/// Demo: searches a fixed sentence for a word that begins with "po" and ends with
/// "ion", then prints the matches through <c>WriteMatches</c>.
/// </summary>
static void Find_po()
{
    const RegexOptions searchOptions = RegexOptions.IgnoreCase
        | RegexOptions.IgnorePatternWhitespace
        | RegexOptions.ExplicitCapture;

    string sentence = @" I can not find my position in Beijing ";
    Regex finder = new Regex(@"\bpo\S*ion\b", searchOptions);

    WriteMatches(sentence, finder.Matches(sentence));
}
/// <summary>
/// Prints the searched text, the number of matches, and one line per match holding
/// its index, its value and an excerpt with up to five characters of context on
/// either side.
/// </summary>
/// <param name="text">The text that was searched.</param>
/// <param name="matches">The matches found in <paramref name="text"/>.</param>
static void WriteMatches(string text, MatchCollection matches)
{
    const int contextWidth = 5;

    Console.WriteLine("Original text was: \n\n" + text + "\n");
    Console.WriteLine("No. of matches: " + matches.Count);

    for (int i = 0; i < matches.Count; i++)
    {
        Match current = matches[i];
        string found = current.Value;
        int start = current.Index;

        // Clamp the context so the excerpt never runs past either end of the text.
        int lead = Math.Min(start, contextWidth);
        int trail = Math.Min(text.Length - start - found.Length, contextWidth);
        string excerpt = text.Substring(start - lead, lead + found.Length + trail);

        Console.WriteLine("Index: {0}, \tString: {1}, \t{2}", start, found, excerpt);
    }
}
/// <summary>
/// Pattern capturing the protocol (group <c>proto</c>) and the optional ":port"
/// (group <c>port</c>) at the start of a URL. Built once because compiling a regular
/// expression is expensive.
/// </summary>
private static readonly Regex ProtocolAndPortRegex = new Regex(@"^(?<proto>\w+)://[^/]+?(?<port>:\d+)?/",
    RegexOptions.Compiled);

/// <summary>
/// Returns the protocol followed by the port (colon included) of a URL; for example
/// "http://www.yahoo.com.cn:8080/index.html" yields "http:8080".
/// </summary>
/// <param name="url">The URL to inspect; it must contain a "/" after the host to match.</param>
/// <returns>
/// The protocol plus ":port" when a port is present, the protocol alone when it is
/// not, or an empty string when the input is null, empty or does not match.
/// </returns>
String Extension(String url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }

    Match match = ProtocolAndPortRegex.Match(url);

    // Match.Result throws NotSupportedException on a failed match, so test Success first.
    return match.Success ? match.Result("${proto}${port}") : string.Empty;
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, decodes it as UTF-8 and returns it
/// after normalisation by <c>ReplaceEnter</c>, <c>RemoveStyle</c> and <c>RemoveScript</c>.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The normalised page source, or an empty string when there is no response stream
/// or any exception occurs (the failing URL is logged and written to the console).
/// </returns>
private static string GetHtml(string url)
{
    try
    {
        WebRequest webRequest = WebRequest.Create(url);

        // The response, its stream and the reader are all disposable. The original
        // never released them, which leaks the underlying connection.
        using (WebResponse webResponse = webRequest.GetResponse())
        using (Stream responseStream = webResponse.GetResponseStream())
        {
            if (responseStream == null)
            {
                return string.Empty;
            }

            using (var reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                // ReadToEnd replaces the manual 256-character read loop, which grew the
                // result by repeated string concatenation.
                string html = reader.ReadToEnd();
                return RemoveScript(RemoveStyle(ReplaceEnter(html)));
            }
        }
    }
    catch (Exception)
    {
        // Best effort: record which URL failed and fall through to the empty result.
        AreaLogHelper.WriteLogFile("【异常URL】" + url);
        Console.WriteLine("【异常URL】" + url);
    }
    return string.Empty;
}
/// <summary>
/// Normalises HTML source so the literal patterns used elsewhere in this file can
/// match it: removes line breaks, replaces every double quote with the ‘ character,
/// removes several whitespace sequences and lower-cases the result.
/// </summary>
/// <param name="htmlCode">HTML source code to normalise.</param>
/// <returns>The normalised text, or an empty string when the input is null or empty.</returns>
private static string ReplaceEnter(string htmlCode)
{
if (string.IsNullOrEmpty(htmlCode))
return string.Empty;
// Order matters: "\r\n" pairs are removed before the remaining lone "\n" and "\r".
// NOTE(review): the five consecutive whitespace Replace calls all render as the same
// one-character literal in this copy of the file; presumably they originally targeted
// different whitespace sequences (tabs, runs of spaces) — confirm against the original.
// NOTE(review): ToLower() uses the current culture; ToLowerInvariant() may be the safer
// choice for markup — confirm before changing.
return htmlCode.Replace("\r\n", "").Replace("\"", "‘").Replace("\n", "").Replace("\r", "").Replace(" ", "").Replace(" ", "").Replace(" ", "").Replace(" ", "").Replace(" ", "").ToLower();
}
#region private methods
/// <summary>
/// Removes every HTML comment from <paramref name="input"/>.
/// </summary>
/// <param name="input">HTML text; must not be null.</param>
/// <returns>The text with all comment blocks removed.</returns>
private static string RemoveComment(string input)
{
    // Lazy ".*?" with Singleline stops at the first "-->" and spans line breaks.
    // The previous "[^-]*" body could not cross a hyphen, so a comment such as
    // "<!-- foo-bar -->" was left in place.
    return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
}
/// <summary>
/// Removes every style element, including its content, from <paramref name="input"/>.
/// Matching ignores case and spans line breaks.
/// </summary>
/// <param name="input">HTML text; must not be null.</param>
/// <returns>The text with all style blocks removed.</returns>
private static string RemoveStyle(string input)
{
    const RegexOptions blockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;
    var styleBlock = new Regex(@"<style[^>]*?>.*?</style>", blockOptions);
    return styleBlock.Replace(input, string.Empty);
}
/// <summary>
/// Removes every script element and then every noscript element, including their
/// content, from <paramref name="input"/>. Matching ignores case and spans line breaks.
/// </summary>
/// <param name="input">HTML text; must not be null.</param>
/// <returns>The text with all script and noscript blocks removed.</returns>
private static string RemoveScript(string input)
{
    // Applied in this order: script blocks first, noscript blocks second.
    string[] blockPatterns =
    {
        @"<script[^>]*?>.*?</script>",
        @"<noscript[^>]*?>.*?</noscript>"
    };

    string cleaned = input;
    foreach (string pattern in blockPatterns)
    {
        cleaned = Regex.Replace(cleaned, pattern, string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
    }
    return cleaned;
}
/// <summary>
/// Converts an HTML fragment to plain text: br tags become CR/LF line breaks, all
/// other tags are removed, and the named entities nbsp, quot, lt, gt and amp are
/// decoded.
/// </summary>
/// <param name="input">HTML text to convert.</param>
/// <returns>The plain text, or an empty string when the input is null or empty.</returns>
private static string RemoveTags(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    // Line breaks first, while the br tags still exist.
    string result = Regex.Replace(input, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);

    // Strip tags BEFORE decoding entities; otherwise text between a decoded "<" and
    // ">" (e.g. "1 &lt; 2 and 3 &gt; 2") would be deleted as if it were a tag.
    result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);

    // The search strings must be the entity names. In the previous text they had been
    // replaced by their decoded characters, leaving no-op replacements and an empty
    // search string, which makes String.Replace throw ArgumentException.
    result = result.Replace("&nbsp;", " ");
    result = result.Replace("&quot;", "\"");
    result = result.Replace("&lt;", "<");
    result = result.Replace("&gt;", ">");
    // "&amp;" goes last so that "&amp;lt;" decodes to the text "&lt;" rather than "<".
    result = result.Replace("&amp;", "&");
    return result;
}
#endregion
/// <summary>
/// Runs a regular expression over HTML source and collects the <c>code</c> and
/// <c>name</c> groups of every match.
/// </summary>
/// <param name="regexString">Pattern that defines the named groups <c>code</c> and <c>name</c>.</param>
/// <param name="remoteStr">HTML source code to search.</param>
/// <returns>One item per match, in match order, already materialised as a list.</returns>
private static IEnumerable<AreaHtmlValue> GetRegValue(string regexString, string remoteStr)
{
    var extractor = new Regex(regexString, RegexOptions.Compiled);
    var rows = new List<AreaHtmlValue>();

    foreach (Match hit in extractor.Matches(remoteStr))
    {
        var row = new AreaHtmlValue();
        row.Code = hit.Groups["code"].Value;
        row.Name = hit.Groups["name"].Value;
        rows.Add(row);
    }

    return rows;
}
/// <summary>
/// One extracted match: the raw text of the named regex groups, as filled in by
/// <c>GetRegValue</c>.
/// </summary>
private class AreaHtmlValue
{
/// <summary>Text captured by the <c>code</c> group.</summary>
public string Code { get; set; }
/// <summary>Text captured by the <c>name</c> group.</summary>
public string Name { get; set; }
/// <summary>
/// Intended for a <c>type</c> group; the assignment in <c>GetRegValue</c> is commented
/// out, so nothing visible in this file sets it.
/// </summary>
public string Type { get; set; }
}
#endregion
}
// Source: http://www.cnblogs.com/geovindu/p/5200334.html