/// <summary>
/// Strips HTML markup from a string and returns the remaining text, HTML-encoded.
/// </summary>
/// <param name="Htmlstring">Source text that may contain HTML markup. <c>null</c> or empty yields an empty string.</param>
/// <returns>
/// The input with script blocks, tags and comment remnants removed, a small set of
/// entities decoded, any remaining angle brackets and CRLF pairs dropped, and the
/// result HTML-encoded and trimmed.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove script blocks. Singleline lets '.' cross line breaks, so scripts that
    // span several lines are removed together with their content.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove every remaining tag.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "");

    // Drop a line-break character together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "");

    // Remove comment delimiters left behind when a comment contained '>' or was
    // never closed. These must be ASCII hyphens to match real HTML comments.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "");
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "");

    // Decode the handful of entities this helper understands (pattern, replacement).
    string[,] entities =
    {
        { @"&(quot|#34);", "\"" },
        { @"&(amp|#38);", "&" },
        { @"&(lt|#60);", "<" },
        { @"&(gt|#62);", ">" },
        { @"&(nbsp|#160);", " " },
        { @"&(iexcl|#161);", "\u00A1" },
        { @"&(cent|#162);", "\u00A2" },
        { @"&(pound|#163);", "\u00A3" },
        { @"&(copy|#169);", "\u00A9" },
    };
    for (int i = 0; i < entities.GetLength(0); i++)
    {
        Htmlstring = Regex.Replace(Htmlstring, entities[i, 0], entities[i, 1],
            RegexOptions.IgnoreCase);
    }

    // Any other numeric entity is dropped.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "");

    // Strings are immutable: the result of Replace has to be assigned, otherwise
    // the stray angle brackets and CRLF pairs are never actually removed.
    Htmlstring = Htmlstring.Replace("<", "").Replace(">", "").Replace("\r\n", "");

    // WebUtility.HtmlEncode does not need a live HttpContext, so the method also
    // works from background threads, console programs and unit tests.
    return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
}
// A static method that removes HTML tags.
#region ParseTags
/// <summary>
/// Removes HTML tags from a string.
/// </summary>
/// <param name="HTMLStr">Text that may contain HTML tags. <c>null</c> or empty yields an empty string.</param>
/// <returns>The input with every <c>&lt;...&gt;</c> span removed.</returns>
public static string ParseTags(string HTMLStr)
{
    // Regex.Replace throws on null input; treat "nothing to strip" as empty text.
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return string.Empty;
    }

    return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>", "");
}
#endregion
#region GetImgUrl
/// <summary>
/// Extracts the <c>src</c> value of the first <c>&lt;img&gt;</c> tag found in the text.
/// </summary>
/// <param name="HTMLStr">Text that may contain HTML. <c>null</c> or empty yields an empty string.</param>
/// <returns>
/// The image address exactly as written in the markup (original casing, no
/// surrounding quotes), or an empty string when no <c>&lt;img&gt;</c> tag with a
/// <c>src</c> attribute is present.
/// </returns>
public static string GetImgUrl(string HTMLStr)
{
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return string.Empty;
    }

    // Match case-insensitively instead of lower-casing the input, so that
    // case-sensitive paths and query strings are returned unchanged.
    // The value may be double-quoted, single-quoted or unquoted; the three
    // alternatives share the group name "url" so only the bare value is captured.
    // "src" must be preceded by whitespace so attributes such as "data-src" are skipped.
    const string pattern =
        @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))";

    Match m = Regex.Match(HTMLStr, pattern, RegexOptions.IgnoreCase);
    return m.Success ? m.Groups["url"].Value : string.Empty;
}
#endregion
// C# function that extracts the text from HTML source.
// strHtml: the source that contains HTML markup.
// Returns: the text that remains after the markup has been removed.
6 using System;
7 using System.Text.RegularExpressions;
/// <summary>
/// Console demo for <see cref="StripHTML"/>, which reduces an HTML fragment to its text.
/// </summary>
public class StripHTMLTest
{
    /// <summary>
    /// Strips a sample HTML document and writes the resulting text to the console.
    /// </summary>
    public static void Main()
    {
        string s = StripHTML(
            "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
        Console.WriteLine(s);
    }

    /// <summary>
    /// Removes script blocks, tags and comment remnants from HTML source and decodes
    /// a small set of entities.
    /// </summary>
    /// <param name="strHtml">Source that contains HTML markup. <c>null</c> or empty yields an empty string.</param>
    /// <returns>
    /// The remaining text with all angle brackets and CRLF pairs removed. The result
    /// is not HTML-encoded.
    /// </returns>
    public static string StripHTML(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        // Script blocks go first. Singleline lets '.' cross line breaks so that
        // multi-line scripts are removed together with their content. It is applied
        // to this pattern only; the comment pattern below relies on '.' stopping at '\n'.
        string strOutput = Regex.Replace(strHtml, @"<script[^>]*?>.*?</script>", "",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // aryReg[i] is replaced by aryRep[i], in order.
        string[] aryReg =
        {
            // An opening, closing or "<!" tag; quoted attribute values may contain '>'.
            // Deliberately has no nested quantifiers, so an unterminated "<word..."
            // cannot trigger exponential backtracking.
            @"<[/!]?[a-z](?:[^>""']|""[^""]*""|'[^']*')*>",
            @"([\r\n])[\s]+",
            @"&(quot|#34);",
            @"&(amp|#38);",
            @"&(lt|#60);",
            @"&(gt|#62);",
            @"&(nbsp|#160);",
            @"&(iexcl|#161);",
            @"&(cent|#162);",
            @"&(pound|#163);",
            @"&(copy|#169);",
            @"&#(\d+);",
            @"-->",
            @"<!--.*\n"
        };

        string[] aryRep =
        {
            "",
            "",
            "\"",
            "&",
            "<",
            ">",
            " ",
            "\u00A1", // inverted exclamation mark (161)
            "\u00A2", // cent sign (162)
            "\u00A3", // pound sign (163)
            "\u00A9", // copyright sign (169)
            "",
            "\r\n",
            ""
        };

        for (int i = 0; i < aryReg.Length; i++)
        {
            strOutput = Regex.Replace(strOutput, aryReg[i], aryRep[i], RegexOptions.IgnoreCase);
        }

        // Strings are immutable: the Replace results must be assigned. Without this,
        // brackets produced by decoding &lt; / &gt; would stay in the un-encoded output.
        strOutput = strOutput.Replace("<", "").Replace(">", "").Replace("\r\n", "");
        return strOutput;
    }
}