几个用于解析 HTML 的 C# 类
命名空间里有两个类:1. HtmlUtil;2. Htmlpage。两者都基于别人写好的代码,我在其中做了不少改动。一开始是用 HtmlUtil 解析网页,它通过正则表达式解析 HTML,后来发现某些情况下解析效果不太好。之后在 SourceForge 上发现了 MIL HTML Parser(即代码中引用的 MIL.Html),拿过来用了一下,效果还不错。
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
using System.Text.RegularExpressions;
5
using MIL.Html;
6
7
namespace Yuanso.Sitework.Crawler
8
{
9
/// <summary>
/// Regular-expression based helpers that pull plain text and the page title
/// out of an HTML string.
/// </summary>
public class HtmlUtil
{
    // All patterns are built once and reused instead of being re-created on every call.

    // Patterns applied, in order, by ExtractContent. Comments are removed after
    // script/style blocks but BEFORE the generic tag pattern: the tag pattern stops
    // at the first '>' and would otherwise cut a comment such as "<!-- a > b -->"
    // in half and leak " b " into the output.
    private static readonly Regex[] ContentPatterns =
    {
        new Regex(@"<%=[\w\W]*?%>", RegexOptions.IgnoreCase),
        new Regex(@"<script[\w\W]*?</script>", RegexOptions.IgnoreCase),
        new Regex(@"<style[\w\W]*?</style>", RegexOptions.IgnoreCase),
        new Regex(@"<!--[\w\W]*?-->", RegexOptions.IgnoreCase),
        new Regex(@"<[/]?[\w\W]*?>", RegexOptions.IgnoreCase),
        new Regex(@"([\r\n])[\s]+", RegexOptions.IgnoreCase),
        new Regex(@"&(nbsp|#160);", RegexOptions.IgnoreCase),
        new Regex(@"&(iexcl|#161);", RegexOptions.IgnoreCase),
        new Regex(@"&(cent|#162);", RegexOptions.IgnoreCase),
        new Regex(@"&(pound|#163);", RegexOptions.IgnoreCase),
        new Regex(@"&(copy|#169);", RegexOptions.IgnoreCase),
        new Regex(@"&#(\d+);", RegexOptions.IgnoreCase),
        // The last two only catch leftovers of unterminated or broken comments.
        new Regex(@"-->", RegexOptions.IgnoreCase),
        new Regex(@"<!--.*\n", RegexOptions.IgnoreCase)
    };

    // Replacement for the pattern at the same index in ContentPatterns.
    // Only the non-breaking-space entity becomes a blank; everything else is dropped.
    private static readonly string[] ContentReplacements =
    {
        "", "", "", "", "", "", " ", "", "", "", "", "", "", ""
    };

    // Shared by ExtractTitle and getTitle. [\w\W] lets the title span several lines.
    private static readonly Regex TitlePattern =
        new Regex(@"<title[^>]*>(?<title>[\w\W]*?)</title[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex ScriptBlockPattern =
        new Regex(@"<script[^>]*>[\w\W]*?</script[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex StyleBlockPattern =
        new Regex(@"<style[^>]*>[\w\W]*?</style[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex SelectBlockPattern =
        new Regex(@"<select[^>]*>[\w\W]*?</select[^>]*>", RegexOptions.IgnoreCase);

    // \b keeps the pattern from treating <abbr>, <address>, <article>... as links.
    private static readonly Regex AnchorBlockPattern =
        new Regex(@"<a\b[^>]*>[\w\W]*?</a\b[^>]*>", RegexOptions.IgnoreCase);

    // NOTE(review): the second alternative is a literal space, so every space
    // character in the text is deleted. It may originally have been the "&nbsp;"
    // entity - confirm before changing; the pattern is kept as found.
    private static readonly Regex TagOrSpacePattern =
        new Regex("(<[^>]+?>)| ", RegexOptions.IgnoreCase);

    private static readonly Regex WhitespaceRunPattern = new Regex("(\\s)+");

    /// <summary>
    /// Picks up the text content of an HTML document. The following are removed, in this order:
    /// ASP expressions (&lt;%= ... %&gt;), script blocks, style blocks, HTML comments,
    /// remaining HTML tags, line breaks followed by whitespace, the entities
    /// iexcl/cent/pound/copy and all numeric character references. The nbsp entity is
    /// replaced by a single blank. Finally every "\r\n" pair and every tab is removed.
    /// </summary>
    /// <remarks>
    /// Written: [CHINA] Zhang Liu. Date: 1,Jun,2006. Version: 1.0.
    /// Support: MYBASK <see href="http://www.mybask.net"/>.
    /// For the latest version or a similar implementation of this function, visit
    /// <see href="http://www.mybask.net"/>.
    /// </remarks>
    /// <param name="strHtml">HTML markup to strip of html, javascript and style elements.</param>
    /// <returns>The stripped text; an empty string when <paramref name="strHtml"/> is null or empty.</returns>
    public static string ExtractContent(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string stripped = strHtml;
        for (int i = 0; i < ContentPatterns.Length; i++)
        {
            stripped = ContentPatterns[i].Replace(stripped, ContentReplacements[i]);
        }

        // String.Replace returns a new string; the result must be assigned back,
        // otherwise the call has no effect.
        stripped = stripped.Replace("\r\n", "");
        stripped = stripped.Replace("\t", "");
        return stripped;
    }

    /// <summary>
    /// Returns the text between the first title start tag and its end tag.
    /// The tag name is matched case-insensitively and may carry attributes.
    /// </summary>
    /// <param name="strHtml">HTML markup to search.</param>
    /// <returns>
    /// The untrimmed title text, or the placeholder "无标题" when
    /// <paramref name="strHtml"/> is null or contains no title element.
    /// </returns>
    public static string ExtractTitle(string strHtml)
    {
        if (strHtml != null)
        {
            Match match = TitlePattern.Match(strHtml);
            if (match.Success)
            {
                return match.Groups["title"].Value;
            }
        }

        return "无标题";
    }

    /// <summary>
    /// Reduces a piece of HTML to plain text: script, style and select blocks are removed
    /// (and anchor blocks too when <paramref name="withLink"/> is false), then all
    /// remaining tags and space characters are removed, and every run of whitespace
    /// is collapsed to a single blank.
    /// </summary>
    /// <param name="instr">HTML markup.</param>
    /// <param name="firstN">
    /// Intended number of leading characters to keep. NOTE(review): currently not applied -
    /// the truncation was commented out in the original code and the full text is returned.
    /// </param>
    /// <param name="withLink">Whether the text inside links is kept.</param>
    /// <returns>The plain text; an empty string when <paramref name="instr"/> is null.</returns>
    public static string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (instr == null)
        {
            return string.Empty;
        }

        string stripped = ScriptBlockPattern.Replace(instr, "");
        stripped = StyleBlockPattern.Replace(stripped, "");
        stripped = SelectBlockPattern.Replace(stripped, "");
        if (!withLink)
        {
            stripped = AnchorBlockPattern.Replace(stripped, "");
        }

        stripped = TagOrSpacePattern.Replace(stripped, "");
        stripped = WhitespaceRunPattern.Replace(stripped, " ");
        return stripped;
    }

    /// <summary>
    /// Returns the trimmed text of the first title element.
    /// </summary>
    /// <param name="strHtml">HTML markup to search.</param>
    /// <returns>
    /// The trimmed title, or an empty string when <paramref name="strHtml"/> is null
    /// or contains no title element.
    /// </returns>
    public static string getTitle(string strHtml)
    {
        if (strHtml == null)
        {
            return "";
        }

        Match match = TitlePattern.Match(strHtml);
        return match.Success ? match.Groups["title"].Value.Trim() : "";
    }
}
104
/// <summary>
/// Text extraction helpers that use the MIL.Html parser instead of regular expressions.
/// </summary>
public class Htmlpage
{
    /// <summary>
    /// Parses the markup and returns the text of the first node yielded by
    /// <c>Nodes.FindAllText(true)</c> whose text contains neither '\r' nor '\n'.
    /// </summary>
    /// <param name="strHtml">HTML markup to parse.</param>
    /// <returns>The text of the first matching node, or an empty string when there is none.</returns>
    public static string GetTitle(string strHtml)
    {
        HtmlParser domParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument page = domParser.Parse(strHtml);

        foreach (HtmlNode candidate in page.Nodes.FindAllText(true))
        {
            string fragment = ((HtmlText)candidate).Text;
            bool hasLineBreak = fragment.Contains("\r") || fragment.Contains("\n");
            if (!hasLineBreak)
            {
                // Only the first line-break-free text node is wanted.
                return fragment;
            }
        }

        return "";
    }

    /// <summary>
    /// Parses the markup and concatenates the text of every node yielded by
    /// <c>Nodes.FindAllText(true)</c> whose text contains neither '\r' nor '\n'.
    /// </summary>
    /// <param name="strHtml">HTML markup to parse.</param>
    /// <returns>The concatenated text; an empty string when no node qualifies.</returns>
    public static string GetContent(string strHtml)
    {
        HtmlParser domParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument page = domParser.Parse(strHtml);
        StringBuilder collected = new StringBuilder();

        foreach (HtmlNode candidate in page.Nodes.FindAllText(true))
        {
            string fragment = ((HtmlText)candidate).Text;
            // Nodes containing a line break are skipped entirely.
            if (!fragment.Contains("\r") && !fragment.Contains("\n"))
            {
                collected.Append(fragment);
            }
        }

        return collected.ToString();
    }
}
147
148
}
149
using System;2
using System.Collections.Generic;3
using System.Text;4
using System.Text.RegularExpressions;5
using MIL.Html;6

7
namespace Yuanso.Sitework.Crawler8
{9
/// <summary>
/// Regular-expression based helpers that pull plain text and the page title
/// out of an HTML string.
/// </summary>
public class HtmlUtil
{
    // All patterns are built once and reused instead of being re-created on every call.

    // Patterns applied, in order, by ExtractContent. Comments are removed after
    // script/style blocks but BEFORE the generic tag pattern: the tag pattern stops
    // at the first '>' and would otherwise cut a comment such as "<!-- a > b -->"
    // in half and leak " b " into the output.
    private static readonly Regex[] ContentPatterns =
    {
        new Regex(@"<%=[\w\W]*?%>", RegexOptions.IgnoreCase),
        new Regex(@"<script[\w\W]*?</script>", RegexOptions.IgnoreCase),
        new Regex(@"<style[\w\W]*?</style>", RegexOptions.IgnoreCase),
        new Regex(@"<!--[\w\W]*?-->", RegexOptions.IgnoreCase),
        new Regex(@"<[/]?[\w\W]*?>", RegexOptions.IgnoreCase),
        new Regex(@"([\r\n])[\s]+", RegexOptions.IgnoreCase),
        new Regex(@"&(nbsp|#160);", RegexOptions.IgnoreCase),
        new Regex(@"&(iexcl|#161);", RegexOptions.IgnoreCase),
        new Regex(@"&(cent|#162);", RegexOptions.IgnoreCase),
        new Regex(@"&(pound|#163);", RegexOptions.IgnoreCase),
        new Regex(@"&(copy|#169);", RegexOptions.IgnoreCase),
        new Regex(@"&#(\d+);", RegexOptions.IgnoreCase),
        // The last two only catch leftovers of unterminated or broken comments.
        new Regex(@"-->", RegexOptions.IgnoreCase),
        new Regex(@"<!--.*\n", RegexOptions.IgnoreCase)
    };

    // Replacement for the pattern at the same index in ContentPatterns.
    // Only the non-breaking-space entity becomes a blank; everything else is dropped.
    private static readonly string[] ContentReplacements =
    {
        "", "", "", "", "", "", " ", "", "", "", "", "", "", ""
    };

    // Shared by ExtractTitle and getTitle. [\w\W] lets the title span several lines.
    private static readonly Regex TitlePattern =
        new Regex(@"<title[^>]*>(?<title>[\w\W]*?)</title[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex ScriptBlockPattern =
        new Regex(@"<script[^>]*>[\w\W]*?</script[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex StyleBlockPattern =
        new Regex(@"<style[^>]*>[\w\W]*?</style[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex SelectBlockPattern =
        new Regex(@"<select[^>]*>[\w\W]*?</select[^>]*>", RegexOptions.IgnoreCase);

    // \b keeps the pattern from treating <abbr>, <address>, <article>... as links.
    private static readonly Regex AnchorBlockPattern =
        new Regex(@"<a\b[^>]*>[\w\W]*?</a\b[^>]*>", RegexOptions.IgnoreCase);

    // NOTE(review): the second alternative is a literal space, so every space
    // character in the text is deleted. It may originally have been the "&nbsp;"
    // entity - confirm before changing; the pattern is kept as found.
    private static readonly Regex TagOrSpacePattern =
        new Regex("(<[^>]+?>)| ", RegexOptions.IgnoreCase);

    private static readonly Regex WhitespaceRunPattern = new Regex("(\\s)+");

    /// <summary>
    /// Picks up the text content of an HTML document. The following are removed, in this order:
    /// ASP expressions (&lt;%= ... %&gt;), script blocks, style blocks, HTML comments,
    /// remaining HTML tags, line breaks followed by whitespace, the entities
    /// iexcl/cent/pound/copy and all numeric character references. The nbsp entity is
    /// replaced by a single blank. Finally every "\r\n" pair and every tab is removed.
    /// </summary>
    /// <remarks>
    /// Written: [CHINA] Zhang Liu. Date: 1,Jun,2006. Version: 1.0.
    /// Support: MYBASK <see href="http://www.mybask.net"/>.
    /// For the latest version or a similar implementation of this function, visit
    /// <see href="http://www.mybask.net"/>.
    /// </remarks>
    /// <param name="strHtml">HTML markup to strip of html, javascript and style elements.</param>
    /// <returns>The stripped text; an empty string when <paramref name="strHtml"/> is null or empty.</returns>
    public static string ExtractContent(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string stripped = strHtml;
        for (int i = 0; i < ContentPatterns.Length; i++)
        {
            stripped = ContentPatterns[i].Replace(stripped, ContentReplacements[i]);
        }

        // String.Replace returns a new string; the result must be assigned back,
        // otherwise the call has no effect.
        stripped = stripped.Replace("\r\n", "");
        stripped = stripped.Replace("\t", "");
        return stripped;
    }

    /// <summary>
    /// Returns the text between the first title start tag and its end tag.
    /// The tag name is matched case-insensitively and may carry attributes.
    /// </summary>
    /// <param name="strHtml">HTML markup to search.</param>
    /// <returns>
    /// The untrimmed title text, or the placeholder "无标题" when
    /// <paramref name="strHtml"/> is null or contains no title element.
    /// </returns>
    public static string ExtractTitle(string strHtml)
    {
        if (strHtml != null)
        {
            Match match = TitlePattern.Match(strHtml);
            if (match.Success)
            {
                return match.Groups["title"].Value;
            }
        }

        return "无标题";
    }

    /// <summary>
    /// Reduces a piece of HTML to plain text: script, style and select blocks are removed
    /// (and anchor blocks too when <paramref name="withLink"/> is false), then all
    /// remaining tags and space characters are removed, and every run of whitespace
    /// is collapsed to a single blank.
    /// </summary>
    /// <param name="instr">HTML markup.</param>
    /// <param name="firstN">
    /// Intended number of leading characters to keep. NOTE(review): currently not applied -
    /// the truncation was commented out in the original code and the full text is returned.
    /// </param>
    /// <param name="withLink">Whether the text inside links is kept.</param>
    /// <returns>The plain text; an empty string when <paramref name="instr"/> is null.</returns>
    public static string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (instr == null)
        {
            return string.Empty;
        }

        string stripped = ScriptBlockPattern.Replace(instr, "");
        stripped = StyleBlockPattern.Replace(stripped, "");
        stripped = SelectBlockPattern.Replace(stripped, "");
        if (!withLink)
        {
            stripped = AnchorBlockPattern.Replace(stripped, "");
        }

        stripped = TagOrSpacePattern.Replace(stripped, "");
        stripped = WhitespaceRunPattern.Replace(stripped, " ");
        return stripped;
    }

    /// <summary>
    /// Returns the trimmed text of the first title element.
    /// </summary>
    /// <param name="strHtml">HTML markup to search.</param>
    /// <returns>
    /// The trimmed title, or an empty string when <paramref name="strHtml"/> is null
    /// or contains no title element.
    /// </returns>
    public static string getTitle(string strHtml)
    {
        if (strHtml == null)
        {
            return "";
        }

        Match match = TitlePattern.Match(strHtml);
        return match.Success ? match.Groups["title"].Value.Trim() : "";
    }
}
/// <summary>
/// Text extraction helpers that use the MIL.Html parser instead of regular expressions.
/// </summary>
public class Htmlpage
{
    /// <summary>
    /// Parses the markup and returns the text of the first node yielded by
    /// <c>Nodes.FindAllText(true)</c> whose text contains neither '\r' nor '\n'.
    /// </summary>
    /// <param name="strHtml">HTML markup to parse.</param>
    /// <returns>The text of the first matching node, or an empty string when there is none.</returns>
    public static string GetTitle(string strHtml)
    {
        HtmlParser domParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument page = domParser.Parse(strHtml);

        foreach (HtmlNode candidate in page.Nodes.FindAllText(true))
        {
            string fragment = ((HtmlText)candidate).Text;
            bool hasLineBreak = fragment.Contains("\r") || fragment.Contains("\n");
            if (!hasLineBreak)
            {
                // Only the first line-break-free text node is wanted.
                return fragment;
            }
        }

        return "";
    }

    /// <summary>
    /// Parses the markup and concatenates the text of every node yielded by
    /// <c>Nodes.FindAllText(true)</c> whose text contains neither '\r' nor '\n'.
    /// </summary>
    /// <param name="strHtml">HTML markup to parse.</param>
    /// <returns>The concatenated text; an empty string when no node qualifies.</returns>
    public static string GetContent(string strHtml)
    {
        HtmlParser domParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument page = domParser.Parse(strHtml);
        StringBuilder collected = new StringBuilder();

        foreach (HtmlNode candidate in page.Nodes.FindAllText(true))
        {
            string fragment = ((HtmlText)candidate).Text;
            // Nodes containing a line break are skipped entirely.
            if (!fragment.Contains("\r") && !fragment.Contains("\n"))
            {
                collected.Append(fragment);
            }
        }

        return collected.ToString();
    }
}

148
}149



浙公网安备 33010602011771号