C# 使用正则表达式解析 HTML

  1 #region 解析HTML
  /// <summary>
  /// Returns every complete element of the requested tag found in an HTML string.
  /// </summary>
  /// <param name="as_Html">HTML text to search. Null or empty input yields an empty array.</param>
  /// <param name="tags">
  /// Tag name to look for, e.g. <c>a</c> or <c>div</c>. The value is spliced into the regular
  /// expression unescaped, so a pattern fragment such as <c>h[1-6]</c> keeps working.
  /// Null or empty yields an empty array.
  /// </param>
  /// <returns>
  /// The matched elements (opening tag, content, closing tag) in document order.
  /// Matching is case-insensitive and lazy: each match ends at the first closing tag that
  /// follows, so nested elements of the same tag are not balanced.
  /// </returns>
  public static string[] RegexHtmlToFormat(string as_Html, string tags)
  {
      List<string> list = new List<string>();
      if (string.IsNullOrEmpty(as_Html) || string.IsNullOrEmpty(tags))
      {
          return list.ToArray();
      }

      // The lookahead forces the tag name to end right after `tags`; without it a request
      // for "a" would start a match at "<abbr>", "<article>", ... and swallow everything
      // up to the next "</a>".
      string pattern = "<" + tags + "(?=[\\s/>])[^>]*?>[\\s\\S]*?<\\/" + tags + ">";

      // Matches() already returns an empty collection when nothing matches, so a separate
      // IsMatch() pass would only scan the input twice.
      foreach (Match match in Regex.Matches(as_Html, pattern, RegexOptions.IgnoreCase))
      {
          list.Add(match.Value);
      }
      return list.ToArray();
  }
 /// <summary>
 /// Collects the URL captured from the <c>src</c> attribute of each <c>img</c> tag
 /// matched in an HTML string.
 /// </summary>
 /// <param name="sHtmlText">HTML markup to scan.</param>
 /// <returns>One entry per matched <c>img</c> tag, in the order the tags appear.</returns>
 public static string[] GetHtmlImageUrlList(string sHtmlText)
 {
     // Case-insensitive img-tag pattern; the named group "imgUrl" holds the captured value.
     const string imgPattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
     MatchCollection found = Regex.Matches(sHtmlText, imgPattern, RegexOptions.IgnoreCase);

     // Copy the captured group of every match into a result array of the same size.
     string[] urls = new string[found.Count];
     for (int idx = 0; idx < urls.Length; idx++)
     {
         urls[idx] = found[idx].Groups["imgUrl"].Value;
     }
     return urls;
 }
 /// <summary>
 /// Returns every <c>div</c> element whose opening tag is exactly
 /// <c>&lt;div class='NAME'&gt;</c> or <c>&lt;div class="NAME"&gt;</c>.
 /// </summary>
 /// <param name="as_Html">HTML text to search. Null or empty input yields an empty array.</param>
 /// <param name="className">
 /// Literal value of the <c>class</c> attribute. It is regex-escaped, so names containing
 /// characters such as <c>[</c>, <c>.</c> or <c>+</c> are matched verbatim. Null is treated as empty.
 /// </param>
 /// <returns>
 /// The matched elements (opening tag through the first following <c>&lt;/div&gt;</c>) in
 /// document order. Nested divs are not balanced: a match stops at the first closing tag.
 /// </returns>
 public static string[] RegexHtmlDiv(string as_Html, string className)
 {
     List<string> list = new List<string>();
     if (string.IsNullOrEmpty(as_Html))
     {
         return list.ToArray();
     }

     // (['"]) ... \1 accepts either quote style but requires the same quote on both sides.
     // [\s\S]*? matches the same text as the former (.|\n)*? without per-character alternation.
     string pattern = "<div class=(['\"])" + Regex.Escape(className ?? string.Empty) + "\\1>[\\s\\S]*?</div>";

     // Matches() returns an empty collection when nothing is found; no IsMatch() pre-scan needed.
     foreach (Match match in Regex.Matches(as_Html, pattern, RegexOptions.IgnoreCase))
     {
         list.Add(match.Value);
     }
     return list.ToArray();
 }
 57 
 /// <summary>
 /// Sample parser: strips all tab and space characters from the HTML, then concatenates
 /// every <c>&lt;tr&gt;...&lt;/tr&gt;</c> row found in the result.
 /// </summary>
 /// <param name="as_Html">HTML text to search. Null or empty input yields an empty string.</param>
 /// <returns>
 /// The matched rows joined back to back, without separators; an empty string when no row
 /// matches. Because spaces are removed first, only a bare <c>&lt;tr&gt;</c> opening tag
 /// (no attributes) is recognised, and spaces inside cell text are lost.
 /// </returns>
 public static string RegexHTMLList(string as_Html)
 {
     if (string.IsNullOrEmpty(as_Html))
     {
         return string.Empty;
     }

     string compact = as_Html.Replace("\t", "").Replace(" ", "");

     // Singleline lets '.' cross line breaks. The previous Multiline flag only affects
     // ^ and $, so any row spread over several lines was silently skipped.
     MatchCollection rows = Regex.Matches(compact, "<tr>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

     // StringBuilder avoids re-copying the accumulated text for every row.
     System.Text.StringBuilder joined = new System.Text.StringBuilder();
     foreach (Match row in rows)
     {
         joined.Append(row.Value);
     }
     return joined.ToString();
 }
 78 
 /// <summary>
 /// Returns the value of an attribute from the first matching tag in a string.
 /// </summary>
 /// <param name="str">Text (typically HTML) to search. Null or empty yields an empty string.</param>
 /// <param name="title">
 /// Tag name, e.g. <c>img</c>. Spliced into the regular expression unescaped.
 /// </param>
 /// <param name="attrib">
 /// Attribute name, e.g. <c>src</c>. Spliced into the regular expression unescaped.
 /// </param>
 /// <returns>
 /// The attribute value without its surrounding quotes, taken from the first tag named
 /// <paramref name="title"/> that carries a non-empty <paramref name="attrib"/>;
 /// an empty string when there is no such tag. Matching is case-insensitive.
 /// </returns>
 public static string GetTitleContent(string str, string title, string attrib)
 {
     if (string.IsNullOrEmpty(str))
     {
         return string.Empty;
     }

     // (?=[\s/>])  - the tag name must end here, so "a" does not match "<abbr ...>".
     // \s{attrib}  - the attribute must start after whitespace, so "src" does not match "data-src".
     // Quoted values may contain whitespace; only unquoted values stop at the first blank.
     string pattern =
         "<" + title + @"(?=[\s/>])[^>]*?\s" + attrib +
         @"\s*=\s*(?:""(?<url>[^""]+)""|'(?<url>[^']+)'|(?<url>[^\s""'>]+))[^>]*>";

     Match found = Regex.Match(str, pattern, RegexOptions.IgnoreCase);

     // Groups["url"].Value is the empty string when the match failed.
     return found.Groups["url"].Value;
 }
 /// <summary>
 /// Parses the attributes of an HTML element's opening tag into name/value pairs.
 /// </summary>
 /// <param name="HtmlElement">
 /// Markup of a single opening tag, e.g. <c>&lt;input type="text" size=20&gt;</c>.
 /// Null or empty yields an empty table. Text outside a tag that happens to look like
 /// <c>name=value</c> is picked up as well, so pass only the tag itself.
 /// </param>
 /// <returns>
 /// A <see cref="System.Collections.Hashtable"/> mapping each attribute name to its value
 /// (both strings, quotes removed). Values may be double-quoted, single-quoted or unquoted;
 /// attributes with an empty value are skipped. When a name occurs more than once the last
 /// occurrence wins. Keys are case-sensitive.
 /// </returns>
 public static System.Collections.Hashtable getAttrs(string HtmlElement)
 {
     System.Collections.Hashtable ht = new System.Collections.Hashtable();
     if (string.IsNullOrEmpty(HtmlElement))
     {
         return ht;
     }

     // name : one or more characters that cannot be part of an attribute name. The earlier
     //        class [\S^=] was a malformed negation that also accepted '=' and quotes, so a
     //        name could swallow part of a preceding value.
     // value: "..." | '...' | an unquoted run of any length (previously limited to one character).
     const string pattern =
         @"(?<name>[^\s=<>/""']+)\s*=\s*(?:""(?<value>[^""]+)""|'(?<value>[^']+)'|(?<value>[^\s""'>]+))";

     foreach (Match m in Regex.Matches(HtmlElement, pattern))
     {
         // Read the groups by name rather than by number; numbering shifts if the pattern changes.
         ht[m.Groups["name"].Value] = m.Groups["value"].Value;
     }
     return ht;
 }
108         #endregion
posted @ 2018-01-11 09:14 伏地魔程序员 阅读(1386) 评论(0) 收藏 举报
刷新页面返回顶部
伏地魔程序员

C# 使用正则表达式解析 HTML

公告