1 #region 解析HTML
/// <summary>
/// Extracts every complete element with the given tag name (opening tag through the
/// first following closing tag) from an HTML string.
/// </summary>
/// <param name="as_Html">HTML text to scan. <c>null</c> or empty yields an empty array.</param>
/// <param name="tags">
/// Tag name to look for, e.g. <c>"div"</c>. The value is inserted into the regular
/// expression unescaped, so a regex fragment such as <c>"h[1-6]"</c> is accepted as well.
/// <c>null</c> or empty yields an empty array.
/// </param>
/// <returns>
/// The matched elements in document order. Matching is case-insensitive and lazy, so
/// nested elements of the same name end at the first closing tag encountered.
/// </returns>
public static string[] RegexHtmlToFormat(string as_Html, string tags)
{
    if (string.IsNullOrEmpty(as_Html) || string.IsNullOrEmpty(tags))
    {
        return new string[0];
    }

    // The negative lookahead keeps the tag name from matching a mere prefix of a longer
    // name; without it "b" would also open on "<body>" or "<br>".
    string pattern = "<" + tags + "(?![\\w:.-])[^>]*?>[\\s\\S]*?<\\/" + tags + ">";
    Regex regex = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Matches() returns an empty collection when nothing is found, so a separate
    // IsMatch() pre-check would only scan the input a second time.
    MatchCollection matchCollection = regex.Matches(as_Html);
    List<string> list = new List<string>(matchCollection.Count);
    foreach (Match match in matchCollection)
    {
        list.Add(match.Value);
    }
    return list.ToArray();
}
/// <summary>
/// Collects the <c>src</c> value of every <c>img</c> tag found in an HTML string.
/// </summary>
/// <param name="sHtmlText">HTML markup to search.</param>
/// <returns>
/// One entry per matched <c>img</c> tag, in document order; an empty array when no tag matches.
/// </returns>
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
    // Case-insensitive pattern for an img tag. The named group "imgUrl" captures the src
    // value up to the first whitespace, quote or angle bracket.
    Regex imgTagPattern = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    MatchCollection found = imgTagPattern.Matches(sHtmlText);

    // Copy the captured URL of each match into a result array of the same size.
    string[] urls = new string[found.Count];
    for (int index = 0; index < urls.Length; index++)
    {
        urls[index] = found[index].Groups["imgUrl"].Value;
    }
    return urls;
}
/// <summary>
/// Returns every <c>div</c> block whose opening tag consists solely of a <c>class</c>
/// attribute equal to the given class name.
/// </summary>
/// <param name="as_Html">HTML text to scan. <c>null</c> or empty yields an empty array.</param>
/// <param name="className">
/// Exact value of the <c>class</c> attribute. It is matched literally (regex metacharacters
/// are escaped); <c>null</c> is treated as an empty class value.
/// </param>
/// <returns>
/// The matched blocks, opening tag through the first following closing <c>div</c> tag, in
/// document order. Because the match is lazy, a nested <c>div</c> ends the block early.
/// </returns>
public static string[] RegexHtmlDiv(string as_Html, string className)
{
    if (string.IsNullOrEmpty(as_Html))
    {
        return new string[0];
    }

    // Accept either quote style around the class value (the back-reference forces the
    // closing quote to equal the opening one), and escape the class name so characters
    // such as '.', '[' or '+' are compared literally rather than interpreted as regex.
    string pattern = "<div class=(['\"])" + Regex.Escape(className ?? string.Empty) + "\\1>[\\s\\S]*?</div>";
    Regex regex = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Matches() yields an empty collection when nothing is found; no IsMatch() pre-scan needed.
    MatchCollection matchCollection = regex.Matches(as_Html);
    List<string> list = new List<string>(matchCollection.Count);
    foreach (Match match in matchCollection)
    {
        list.Add(match.Value);
    }
    return list.ToArray();
}
57
/// <summary>
/// Sample HTML parser: removes tab and space characters from the input, then concatenates
/// every <c>tr</c> row (bare opening tag, no attributes) found in what remains.
/// </summary>
/// <param name="as_Html">HTML text to parse.</param>
/// <returns>
/// All matched rows joined without a separator; an empty string when no row matches.
/// Rows whose content contains a line feed are not matched, because the pattern's
/// wildcard does not cross line feeds.
/// </returns>
public static string RegexHTMLList(string as_Html)
{
    // Strip tabs and spaces before matching.
    string compact = as_Html.Replace("\t", "").Replace(" ", "");

    MatchCollection rows = Regex.Matches(
        compact,
        "<tr>(?<CompanyName>.*?)</tr>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Append each full row match (tags included) in document order.
    string joined = "";
    for (int i = 0; i < rows.Count; i++)
    {
        joined += rows[i].Value;
    }
    return joined;
}
78
/// <summary>
/// Returns the value of the given attribute on the first tag of the given name that
/// carries it.
/// </summary>
/// <param name="str">Markup to search. <c>null</c> or empty yields an empty string.</param>
/// <param name="title">Tag name, e.g. <c>"a"</c>. Inserted into the pattern unescaped.</param>
/// <param name="attrib">Attribute name, e.g. <c>"href"</c>. Inserted into the pattern unescaped.</param>
/// <returns>
/// The attribute value without surrounding quotes, or an empty string when no tag matches.
/// The value must not contain whitespace or quote characters.
/// </returns>
public static string GetTitleContent(string str, string title, string attrib)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    // (?![\w:.-])  : the tag name must end here, so "a" does not match "<area".
    // (?<![\w:.-]) : the attribute name must start here, so "href" does not match "data-href".
    // \s*=\s*      : whitespace around '=' is legal in HTML.
    // (['"]?)...\1 : optional opening quote, and the same character must close the value.
    string pattern = string.Format(
        "<{0}(?![\\w:.-])[^>]*?(?<![\\w:.-]){1}\\s*=\\s*(['\"]?)(?<url>[^'\"\\s>]+)\\1[^>]*>",
        title,
        attrib);

    Match tagMatch = Regex.Match(str, pattern, RegexOptions.IgnoreCase);

    // A failed match exposes an empty "url" group, so no Success check is required.
    return tagMatch.Groups["url"].Value;
}
/// <summary>
/// Parses the attributes of an HTML element into name/value pairs.
/// </summary>
/// <param name="HtmlElement">
/// Markup of the element, e.g. an opening tag. <c>null</c> or empty yields an empty table.
/// </param>
/// <returns>
/// A table keyed by attribute name (case preserved) with the unquoted value as a string.
/// Double-quoted, single-quoted and unquoted values are recognized, including empty quoted
/// values. An unquoted value runs up to the next whitespace, quote or angle bracket, so a
/// trailing slash directly before the closing bracket is part of it. When a name occurs
/// more than once the last value wins. Attributes without a value are not included.
/// </returns>
public static System.Collections.Hashtable getAttrs(string HtmlElement)
{
    System.Collections.Hashtable ht = new System.Collections.Hashtable();
    if (string.IsNullOrEmpty(HtmlElement))
    {
        return ht;
    }

    // name  : a run of characters that cannot be whitespace, '=', quotes, '/' or angle
    //         brackets, which keeps a name from swallowing a preceding quoted value.
    // value : "..." | '...' | an unquoted run of any length.
    MatchCollection mc = Regex.Matches(
        HtmlElement,
        "(?<name>[^\\s=<>\"'/]+)\\s*=\\s*(?:\"(?<value>[^\"]*)\"|'(?<value>[^']*)'|(?<value>[^\\s\"'<>]+))");

    foreach (Match m in mc)
    {
        // Read the groups by name rather than by number so the lookup does not depend on
        // how the regex engine numbers named groups.
        ht[m.Groups["name"].Value] = m.Groups["value"].Value;
    }
    return ht;
}
108 #endregion