C#解析单层html中的文本,然后拼接起来

匹配单层html的小demo,应该能匹配大多数html字符串.多层(嵌套)html标签解析不出来.可能有小bug,我抛砖引玉下,哈哈.

 1 using System;
 2 using System.Collections.Generic;
 3 using System.Linq;
 4 using System.Text;
 5 using System.Text.RegularExpressions;
 6 using System.Threading.Tasks;
 7 
 8 namespace ResolveHtmlText
 9 {
10     class Program
11     {
12         static void Main(string[] args)
13         {
14             string text = @"&nbsp;<span style='color:#1F497D'><span>y<span></span>&nbsp; &nbsp;<span style='color:#1F497D;'>1</span>&nbsp;<span style='color:#1F497D;background-color:#123456'>2</span><span style='color:#1F497D;background-color:#123456;text-align:center'>3</span>  <span style='color:#1F497D;background-color:#123456;text-align:center;'>4</span> <span style='color:#1F497D;background-color:#123456;text-align:center;tt-l: 134;'>5</span>ggjf<a>123456</a>";
15             Console.WriteLine("原字符串:" + text);
16             text = text.Replace("\"", "'");
17             text = text.Replace("&quot;", "'");
18             text = text.Replace("&nbsp;", "");
19             text = text.Replace("&lt;", "<");//将<的转义码&lt;都替换成<
20             text = text.Replace("&gt;", ">");//将>的转义码&gt;都替换成>
21 
22             //string matchStr = @"<\s*[a-zA-Z0-9]+\s*>[^<^>]*<\s*/\s*[a-zA-Z0-9]+\s*>";
23             string matchStr = @"<\s*[a-zA-Z0-9]+\s*[a-zA-Z]+\s*=\s*'\s*[a-zA-Z]"
24                             + @"+\s*:\s*[^<^>];?'\s*>[^<^>]"
25                             + @"*<\s*/\s*[a-zA-Z0-9]+\s*>|<\s*[a-zA-Z0-9]"
26                             + @"+\s*(\s*[a-zA-Z-]+\s*=\s*'(\s*[a-zA-Z-]+\s*:"
27                             + @"\s*[^:^;^<^>]+\s*;\s*)*(\s*[a-zA-Z-]+\s*:\s*"
28                             + @"[^:^;^<^>]+\s*)\s*;?\s*'\s*)*"
29                             + @"\s*>[^<^>]*<\s*/\s*[a-zA-Z0-9]+\s*>";
30 
31             Regex htmlReg = new Regex(matchStr);
32             string result = null;
33             MatchCollection htmlMatchCollection = htmlReg.Matches(text);
34             StringBuilder sb = new StringBuilder();
35             
36             foreach (Match m in htmlMatchCollection) 
37             {
38                 if (m != null && m.Groups != null && m.Groups.Count > 0)
39                 {
40                     string temp = m.Groups[0].Value;
41                     Console.WriteLine("临时值:" + temp);
42                     //Regex textReg1 = new Regex(@"[^<^>]+");
43                     //Match textMatch1 = textReg1.Match(temp);
44                     //if (textMatch1 != null && textMatch1.Groups != null && textMatch1.Groups.Count > 0)
45                     //{
46                     //    result = textMatch1.Groups[0].Value;
47                     //    sb.Append(result);
48                     //}
49                     Regex textReg = new Regex(@">.+<");
50                     Match textMatch = textReg.Match(temp);
51                     if (textMatch != null && textMatch.Groups != null && textMatch.Groups.Count > 0)
52                     {
53                         result = textMatch.Groups[0].Value;
54                         if (result.Length > 2)
55                         {
56                             result = result.Substring(1, result.Length - 2);
57                             sb.Append(result);
58                         }
59                     }                   
60                 }
61             }
62 
63             
64 
65             Console.WriteLine("解析出的结果:" + sb.ToString());
66             Console.ReadLine();
67         }
68     }
69 }

 

posted @ 2017-05-12 15:05  ラピスラズリ(Dawn)  阅读(917)  评论(0编辑  收藏  举报