提取HTML代码中的文字
提取HTML代码中的文字：
/// <summary>
/// Removes HTML markup from <paramref name="strHtml"/> and decodes a small,
/// fixed set of character entities, returning the remaining text.
/// </summary>
/// <remarks>
/// The pattern/replacement pairs below are applied one after another, in order,
/// with case-insensitive matching. Afterwards any angle brackets and CR/LF pairs
/// still present are removed, so entity-encoded markup (for example
/// <c>&amp;lt;script&amp;gt;</c>) cannot come out of this method as a live tag.
/// </remarks>
/// <param name="strHtml">The HTML text to clean. May be null or empty.</param>
/// <returns>
/// The cleaned text, or an empty string when <paramref name="strHtml"/> is null or empty.
/// </returns>
public string StripHTML(string strHtml)
{
    // Regex.Replace throws on a null input; treat "nothing in" as "nothing out".
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // aryReg[i] is replaced by aryRep[i]; the two arrays must stay the same length.
    string[] aryReg =
    {
        // Script elements. '.' does not match '\n' here, so only script blocks
        // that sit on a single line are removed by this pattern.
        @"<script[^>]*?>.*?</script>",
        // Opening, closing and self-closing tags, with optional attributes.
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        // A line break followed by further whitespace.
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        // Any other numeric entity is dropped rather than decoded.
        @"&#(\d+);",
        @"-->",
        @"<!--.*\n"
    };

    string[] aryRep =
    {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\xa1", // chr(161)
        "\xa2", // chr(162)
        "\xa3", // chr(163)
        "\xa9", // chr(169)
        "",
        "\r\n",
        ""
    };

    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        // The static overload uses the framework's regex cache, so the patterns
        // are not re-parsed on every call.
        strOutput = Regex.Replace(strOutput, aryReg[i], aryRep[i], RegexOptions.IgnoreCase);
    }

    // string.Replace returns a new string; the result has to be assigned back,
    // otherwise these three clean-up steps have no effect.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
}
浙公网安备 33010602011771号