/// <summary>
/// Extracts the <c>src</c> value of every <c>&lt;img&gt;</c> tag found in an HTML fragment.
/// </summary>
/// <param name="sHtmlText">HTML markup to scan. May be <c>null</c> or empty.</param>
/// <returns>
/// The captured <c>src</c> values in the order the tags appear. An entry can be an empty
/// string when a tag has an empty <c>src</c>. The list is empty when the input is
/// <c>null</c>, empty, or contains no matching tag.
/// </returns>
public static List<string> GetHtmlImageUrlList(string sHtmlText)
{
    List<string> imgList = new List<string>();

    // Nothing to scan; Regex.Matches would throw ArgumentNullException for null input.
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return imgList;
    }

    // Matches an <img ...> tag (any letter case) and captures the src value, whether it is
    // double-quoted, single-quoted or unquoted, into the named group "imgUrl".
    const string imgPattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";

    // The static overload goes through the framework's regex cache, so the pattern is not
    // re-parsed on every call the way a freshly constructed Regex instance would be.
    foreach (Match match in Regex.Matches(sHtmlText, imgPattern, RegexOptions.IgnoreCase))
    {
        imgList.Add(match.Groups["imgUrl"].Value);
    }

    return imgList;
}
24
/// <summary>
/// Collects the text of every named anchor (<c>&lt;a name="..."&gt;...&lt;/a&gt;</c>)
/// found in an HTML fragment.
/// </summary>
/// <param name="sHtmlText">HTML markup to scan. May be <c>null</c> or empty.</param>
/// <returns>
/// One entry per matched anchor, in the order found: everything between <c>name="</c> and
/// the closing <c>&lt;/a&gt;</c>, passed through <c>GetTextNoHtml</c>. The list is empty
/// when the input is <c>null</c>, empty, or contains no matching anchor.
/// </returns>
public static List<string> GetHtmlAnchorlList(string sHtmlText)
{
    List<string> achorList = new List<string>();

    // Nothing to scan; Regex.Matches would throw ArgumentNullException for null input.
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return achorList;
    }

    // Singleline lets '.' cross line breaks so anchors whose content spans several lines
    // are still found. (Multiline only changes the meaning of ^ and $, which this pattern
    // does not use.) IgnoreCase accepts upper-case markup such as <A NAME="...">.
    // Group 1 starts right after name=" and therefore holds the name value, the rest of
    // the opening tag and the anchor content up to </a>.
    const string anchorPattern = @"<a\sname=""(.+?)</a>";
    MatchCollection matches = Regex.Matches(
        sHtmlText,
        anchorPattern,
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    foreach (Match match in matches)
    {
        // Reduce the captured fragment to plain text.
        achorList.Add(GetTextNoHtml(match.Groups[1].Value));
    }

    return achorList;
}
49
/// <summary>
/// Reduces an HTML fragment to plain text by removing script blocks, tags and comment
/// markers and decoding a small set of character entities. Can be used to produce a
/// short excerpt of an article by passing <paramref name="length"/>.
/// </summary>
/// <param name="sHtmlText">HTML markup to clean. May be <c>null</c> or empty.</param>
/// <param name="length">
/// Maximum number of characters to return. Zero or a negative value returns the whole
/// cleaned text.
/// </param>
/// <returns>
/// The cleaned text, truncated to <paramref name="length"/> characters when that limit is
/// positive and exceeded. Double quotes, angle brackets, CR/LF pairs and any numeric
/// entity not decoded explicitly are dropped. Returns an empty string for <c>null</c> or
/// empty input.
/// </returns>
public static string GetTextNoHtml(string sHtmlText, int length = 0)
{
    // Regex.Replace throws ArgumentNullException for null input; treat it as "no text".
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return string.Empty;
    }

    // Remove script blocks together with their content. "[^>]*" (rather than "[^>]+?")
    // also covers an attribute-less <script> tag, whose JavaScript would otherwise
    // survive the tag stripping below and end up in the text.
    sHtmlText = Regex.Replace(sHtmlText, @"<script\b[^>]*>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);

    // Strip every remaining tag, then line breaks followed by whitespace.
    sHtmlText = Regex.Replace(sHtmlText, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove leftover comment terminators and unterminated comment openers (to end of line).
    sHtmlText = Regex.Replace(sHtmlText, @"-->", "", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the supported named/numeric entities.
    sHtmlText = Regex.Replace(sHtmlText, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
    sHtmlText = Regex.Replace(sHtmlText, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

    // Any other numeric entity is discarded rather than decoded.
    sHtmlText = Regex.Replace(sHtmlText, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // Drop double quotes (including the ones decoded from &quot; above).
    sHtmlText = sHtmlText.Replace("\"", "");

    // Remove commented-out immediately-invoked function expressions left in the text.
    sHtmlText = Regex.Replace(sHtmlText, @"//\(function\(\)[\s\S]+?}\)\(\);", "", RegexOptions.IgnoreCase);

    // Drop stray angle brackets (including decoded &lt; / &gt;) and CR/LF pairs.
    sHtmlText = sHtmlText.Replace("<", "");
    sHtmlText = sHtmlText.Replace(">", "");
    sHtmlText = sHtmlText.Replace("\r\n", "");

    if (length > 0 && sHtmlText.Length > length)
    {
        return sHtmlText.Substring(0, length);
    }

    return sHtmlText;
}
88
/// <summary>
/// Scans <c>&lt;a href="..."&gt;...&lt;/a&gt;</c> links in the given HTML (for example a
/// pager) and returns the <c>href</c> of the first link whose text matches
/// <paramref name="text"/>. The link text is passed through the <c>RemoveHtml()</c>
/// extension (defined elsewhere; presumably strips markup) and trimmed before comparing.
/// </summary>
/// <param name="html">HTML markup to search.</param>
/// <param name="text">Link text to look for.</param>
/// <param name="iscontain">
/// <c>true</c> to accept a link whose text contains <paramref name="text"/>;
/// <c>false</c> to accept only a link whose text equals it.
/// </param>
/// <returns>
/// The trimmed <c>href</c> value of the first matching link, or an empty string when no
/// link matches.
/// </returns>
public static string GetHtmlPageSelectA(string html, string text, bool iscontain)
{
    // href must be the first attribute; Singleline lets the link text span several lines.
    Regex linkPattern = new Regex(@"<a href=""(?<url>.*?)""(.*?)>(?<text>.*?)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    foreach (Match link in linkPattern.Matches(html))
    {
        string linkText = link.Groups["text"].Value.RemoveHtml().Trim();

        // Containment or exact equality, depending on the caller's choice.
        bool isHit = iscontain ? linkText.Contains(text) : linkText.Equals(text);
        if (isHit)
        {
            return link.Groups["url"].Value.Trim();
        }
    }

    return string.Empty;
}