// Generous upper bound on the characters allowed between '&' and ';' in a character reference.
private const int MaxEntityNameLength = 32;

// Matches one HTML tag: group 1 is "/" for an end tag, group 2 is the tag name.
private static readonly Regex HtmlTagPattern = new Regex(
    @"<(/?)([a-zA-Z][a-zA-Z0-9:_.-]*)[^<>]*>",
    RegexOptions.Compiled);

// Tag names that are never closed by this helper (void elements and elements whose end tag is optional).
private static readonly Regex OptionalEndTagPattern = new Regex(
    "^(?:AREA|BASE|BASEFONT|BODY|BR|COL|COLGROUP|DD|DT|FRAME|HEAD|HR|HTML|IMG|INPUT|ISINDEX|LI|LINK|META|OPTION|P|PARAM|TBODY|TD|TFOOT|TH|THEAD|TR)$",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

/// <summary>
/// Shortens a string that may contain HTML so that at most <paramref name="length"/>
/// visible characters remain, then closes any elements left open by the cut.
/// </summary>
/// <remarks>
/// Tags never count toward the limit. A character reference (for example
/// <c>&amp;nbsp;</c>) and a surrogate pair each count as one visible character;
/// every other character counts as one regardless of its encoded byte width.
/// </remarks>
/// <param name="param">The text to shorten. Null or empty yields an empty string.</param>
/// <param name="length">Maximum number of visible characters to keep. Zero or negative yields an empty string.</param>
/// <param name="end">Suffix appended only when visible text was actually cut off. May be null.</param>
/// <returns>The shortened markup with missing end tags appended.</returns>
public static string subStringHTML(string param, int length, string end)
{
    if (string.IsNullOrEmpty(param) || length <= 0)
    {
        return string.Empty;
    }

    StringBuilder result = new StringBuilder(param.Length);
    int visibleCount = 0;
    int position = 0;
    bool truncated = false;

    while (position < param.Length)
    {
        if (param[position] == '<')
        {
            int tagEnd = FindTagEnd(param, position);
            if (tagEnd >= 0)
            {
                // Once the limit is reached, trailing tags are skipped rather than copied,
                // so that we can tell whether any visible text is really being dropped.
                if (visibleCount < length)
                {
                    result.Append(param, position, tagEnd - position + 1);
                }

                position = tagEnd + 1;
                continue;
            }
        }

        if (visibleCount >= length)
        {
            truncated = true;
            break;
        }

        int unitLength = MeasureVisibleUnit(param, position);
        result.Append(param, position, unitLength);
        position += unitLength;
        visibleCount++;
    }

    if (truncated)
    {
        // StringBuilder.Append ignores a null string, so a null suffix is safe here.
        result.Append(end);
    }

    AppendMissingEndTags(result);
    return result.ToString();
}

// Returns the index of the '>' closing the tag that starts at 'start', or -1 when the '<' is plain text.
private static int FindTagEnd(string text, int start)
{
    if (start + 1 >= text.Length)
    {
        return -1;
    }

    char next = text[start + 1];
    bool looksLikeMarkup = IsAsciiLetter(next) || next == '/' || next == '!' || next == '?';
    return looksLikeMarkup ? text.IndexOf('>', start + 2) : -1;
}

// Returns how many chars form the single visible character that starts at 'start'.
private static int MeasureVisibleUnit(string text, int start)
{
    char first = text[start];
    if (first == '&')
    {
        int semicolon = FindEntityEnd(text, start);
        if (semicolon >= 0)
        {
            return semicolon - start + 1;
        }
    }
    else if (char.IsHighSurrogate(first)
        && start + 1 < text.Length
        && char.IsLowSurrogate(text[start + 1]))
    {
        // Keep both halves together so the output never ends on half a character.
        return 2;
    }

    return 1;
}

// Returns the index of the ';' ending a well-formed character reference, or -1 for a bare '&'.
private static int FindEntityEnd(string text, int ampersandIndex)
{
    int limit = System.Math.Min(text.Length, ampersandIndex + MaxEntityNameLength + 2);
    for (int i = ampersandIndex + 1; i < limit; i++)
    {
        char c = text[i];
        if (c == ';')
        {
            return i > ampersandIndex + 1 ? i : -1;
        }

        if (c != '#' && !IsAsciiLetter(c) && !(c >= '0' && c <= '9'))
        {
            return -1;
        }
    }

    return -1;
}

// True for 'A'-'Z' and 'a'-'z' only.
private static bool IsAsciiLetter(char c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

// Appends an end tag, innermost first, for every element in 'markup' that is still open.
private static void AppendMissingEndTags(StringBuilder markup)
{
    System.Collections.Generic.List<string> openTags = new System.Collections.Generic.List<string>();

    foreach (Match tag in HtmlTagPattern.Matches(markup.ToString()))
    {
        string name = tag.Groups[2].Value;
        if (OptionalEndTagPattern.IsMatch(name))
        {
            continue;
        }

        if (tag.Groups[1].Length == 0)
        {
            openTags.Add(name);
            continue;
        }

        // An end tag closes its nearest matching opener and everything nested inside it;
        // an end tag without an opener is ignored.
        for (int i = openTags.Count - 1; i >= 0; i--)
        {
            if (string.Equals(openTags[i], name, System.StringComparison.OrdinalIgnoreCase))
            {
                openTags.RemoveRange(i, openTags.Count - i);
                break;
            }
        }
    }

    for (int i = openTags.Count - 1; i >= 0; i--)
    {
        markup.Append("</").Append(openTags[i]).Append('>');
    }
}