中英文字符串截取大比拼

2009-04-04 20:23 yearN 阅读(703) 评论(3) 编辑收藏举报

今天忙里偷闲，在网上收集了多个中英文字符串截取算法，测试一下它们哪个算法最优，拿来与大家分享！

测试程序如下：

// Build the benchmark input: 10,000 characters alternating 'A' (even index)
// and 'B' (odd index).
char[] sArr = new char[10000];
for (int idx = 0; idx < sArr.Length; idx++)
{
    sArr[idx] = (idx % 2 == 0) ? 'A' : 'B';
}

string s = new string(sArr);

// Insert "中国人" at every offset that is a multiple of 10. The offsets refer
// to the string as it grows, so each insertion shifts the text after it.
for (int offset = 0; offset < sArr.Length; offset += 10)
{
    s = s.Insert(offset, "中国人");
}

这个长度测试起来，够可以了吧，呵呵！

先看一个比较慢的算法：

/// <summary>
/// Returns <paramref name="input"/> unchanged when its GB2312 byte count is at
/// most <paramref name="length"/>; otherwise returns the longest proper prefix
/// whose GB2312 byte count does not exceed <paramref name="length"/>.
/// </summary>
/// <remarks>
/// This is the deliberately brute-force variant: every candidate prefix is
/// re-encoded from scratch, so the work grows quadratically with the input.
/// </remarks>
/// <param name="input">The string to shorten.</param>
/// <param name="length">The maximum number of GB2312 bytes to keep.</param>
/// <returns>The original string, a prefix of it, or an empty string.</returns>
public static string Intercept2(string input, int length)
{
    // Resolve the encoding once instead of on every loop iteration.
    System.Text.Encoding gb2312 = System.Text.Encoding.GetEncoding("GB2312");

    if (length >= gb2312.GetByteCount(input))
    {
        return input;
    }

    // Try ever shorter prefixes until one fits into the byte budget.
    for (int i = input.Length - 1; i >= 0; i--)
    {
        string prefix = input.Substring(0, i);
        if (gb2312.GetByteCount(prefix) <= length)
        {
            return prefix;
        }
    }

    // Only reachable when length is negative: not even the empty prefix fits.
    return string.Empty;
}

这个算法的用时我没耐心等，估计用时要超过一分钟，有兴趣的朋友可以去等等看看。

再看下一个：

/// <summary>
/// Cuts <paramref name="input"/> to roughly <paramref name="p"/> GB2312 bytes
/// without splitting a double-byte character.
/// </summary>
/// <param name="input">The string to shorten.</param>
/// <param name="p">The byte budget.</param>
/// <returns>
/// The original string when it encodes to at most <paramref name="p"/> bytes;
/// otherwise the text decoded from the first <paramref name="p"/> bytes, or
/// from <paramref name="p"/> + 1 bytes when the cut would otherwise fall
/// inside a double-byte character.
/// </returns>
public static string Intercept(string input, int p)
{
    Encoding gb2312 = Encoding.GetEncoding("gb2312");
    byte[] encoded = gb2312.GetBytes(input);
    if (encoded.Length <= p)
    {
        return input;
    }

    // Key step: count the bytes above 127 among the first p bytes.
    int highBytes = 0;
    int scanned = 0;
    while (scanned < p)
    {
        if (encoded[scanned] > 127)
        {
            highBytes++;
        }
        scanned++;
    }

    // An odd count means the cut lands in the middle of a double-byte
    // character, so one more byte is taken to keep that character whole.
    int cut = (highBytes % 2 != 0) ? p + 1 : scanned;
    return gb2312.GetString(encoded, 0, cut);
}

这个按奇偶位判断的算法耗时1784毫秒左右，我们再看一下：

/// <summary>
/// Shortens <paramref name="str"/> to a display width of exactly
/// <paramref name="strLength"/> columns, where a character whose UTF-16 high
/// byte is zero counts as one column and every other character counts as two.
/// </summary>
/// <param name="str">The string to shorten.</param>
/// <param name="strLength">The maximum display width in columns.</param>
/// <returns>
/// The original string when it already fits; an empty string when
/// <paramref name="strLength"/> is not positive; otherwise a prefix whose last
/// one or two columns are replaced by '.' so that the total width equals
/// <paramref name="strLength"/>.
/// </returns>
static string HalfSubstring(string str, int strLength)
{
    if (strLength <= 0)
    {
        return string.Empty;
    }

    byte[] bytesStr = System.Text.Encoding.Unicode.GetBytes(str);

    // Measure the display width first, so that a string that fits exactly is
    // returned untouched instead of having its last character turned into dots.
    int totalWidth = 0;
    for (int i = 0; i < bytesStr.Length; i += 2)
    {
        totalWidth += (bytesStr[i + 1] == 0) ? 1 : 2;
    }

    if (totalWidth <= strLength)
    {
        return str;
    }

    const byte Dot = 46; // '.' as the low byte of a little-endian UTF-16 unit
    List<byte> list = new List<byte>();
    int count = 0;

    for (int i = 0; i < bytesStr.Length && count < strLength; i += 2)
    {
        int width = (bytesStr[i + 1] == 0) ? 1 : 2;

        if (count + width < strLength)
        {
            // The character fits with room to spare: copy it verbatim.
            list.Add(bytesStr[i]);
            list.Add(bytesStr[i + 1]);
            count += width;
        }
        else
        {
            // The character reaches or crosses the limit: fill the remaining
            // one or two columns with dots, which also ends the loop.
            while (count < strLength)
            {
                list.Add(Dot);
                list.Add(0);
                count++;
            }
        }
    }

    return System.Text.Encoding.Unicode.GetString(list.ToArray());
}

这个算法用时540毫秒左右，比上一个快了两倍多，不过还有更快的，

请看下一个：

/// <summary>
/// Shortens <paramref name="input"/> and appends "..." to the kept part.
/// </summary>
/// <remarks>
/// The early exit compares the character count, not the byte count, with
/// <paramref name="length"/>. Characters are then copied until their
/// accumulated byte count in the system default encoding reaches
/// <paramref name="length"/> - 1, and the ellipsis is added on top of that.
/// </remarks>
/// <param name="input">The string to shorten.</param>
/// <param name="length">The approximate byte budget.</param>
/// <returns>
/// An empty string for empty input, the input itself when it has at most
/// <paramref name="length"/> characters, otherwise the kept prefix followed
/// by "...".
/// </returns>
public static string Intercept1(string input, int length)
{
    if (input.Length == 0)
    {
        return string.Empty;
    }

    if (input.Length <= length)
    {
        return input;
    }

    char[] chars = input.ToCharArray();
    StringBuilder kept = new StringBuilder();
    int byteTotal = 0;
    int pos = 0;

    // Copy one character at a time until the byte total reaches length - 1.
    while (pos < chars.Length && byteTotal < (length - 1))
    {
        kept.Append(chars[pos]);
        byteTotal += Encoding.Default.GetByteCount(chars, pos, 1);
        pos++;
    }

    kept.Append("...");
    return kept.ToString();
}

这个算法用时271毫秒左右，明显比上一个快很多。不过，你认为它就是我们今天字符串截取算法大比拼中的冠军了吗？请看下一个：

/// <summary>
/// Returns a prefix of <paramref name="original"/> sized by a column budget in
/// which every character above 0xFF costs two columns and every other
/// character costs one.
/// </summary>
/// <remarks>
/// A two-column character that straddles the limit is kept rather than
/// dropped, so the result can be one column wider than
/// <paramref name="length"/>.
/// </remarks>
/// <param name="original">The string to shorten.</param>
/// <param name="length">The column budget.</param>
/// <returns>The selected prefix of <paramref name="original"/>.</returns>
public static string Truncate(string original, int length)
{
    int total = original.Length;
    int budget = length;
    int scanned = 0;

    // Each wide character consumes one extra column, so the number of
    // characters that may still be taken shrinks by one.
    while (scanned < budget && scanned < total)
    {
        if (original[scanned] > 0xFF)
        {
            budget--;
        }
        scanned++;
    }

    int take;
    if (budget < scanned)
    {
        // The last scanned character pushed the budget below the scan
        // position; keep everything scanned so far.
        take = scanned;
    }
    else
    {
        take = (budget > total) ? total : budget;
    }

    return original.Substring(0, take);
}

这个算法用时22毫秒左右，算是一个很精炼的算法了。这个算法已经可以说是我们这次大比拼中的冠军了，不过也凑巧，我们这次的目的是截取一个字符串的中文或者英文部分，又一个比较巧妙的算法出现了：

先定义一个枚举：

/// <summary>
/// Selects how characters are counted when cutting a string.
/// </summary>
public enum CutType
{
    /// <summary>An English character counts as one byte, a Chinese character as two.</summary>
    Varchar = 0,

    /// <summary>Every character counts as two bytes, English or Chinese.</summary>
    NVarchar = 1
}

再看算法：

/// <summary>
/// Trims <paramref name="value"/> and cuts it to a byte budget.
/// </summary>
/// <param name="value">The input string.</param>
/// <param name="length">The byte budget.</param>
/// <param name="ellipsis">True to append ".." when the string was cut; false to append nothing.</param>
/// <param name="cuttype">How characters are counted against the budget.</param>
/// <returns>
/// The cut string. With NVarchar a budget of 20 bytes yields 10 characters;
/// with Varchar a budget of 20 bytes yields 10 or more characters. A
/// non-positive budget yields an empty string (plus ".." when requested).
/// </returns>
public static string CutString(string value, int length, bool ellipsis, CutType cuttype)
{
    value = value.Trim();
    if (value.Length == 0)
    {
        return string.Empty;
    }

    bool wasCut = false;

    if (cuttype == CutType.NVarchar)
    {
        // Two bytes per character; clamp so a negative budget cannot reach Substring.
        int maxChars = (length / 2 > 0) ? length / 2 : 0;
        if (value.Length > maxChars)
        {
            value = value.Substring(0, maxChars);
            wasCut = true;
        }
    }
    else
    {
        // Resolve the encoding once; it is needed for up to three calls.
        Encoding gbk = Encoding.GetEncoding("gbk");
        byte[] myByte = gbk.GetBytes(value);
        if (myByte.Length > length)
        {
            if (length <= 0)
            {
                value = string.Empty;
            }
            else
            {
                string resultString = gbk.GetString(myByte, 0, length);
                int lastIndex = resultString.Length - 1;
                if (resultString[lastIndex] == value[lastIndex])
                {
                    // The last decoded character matches the input at the same
                    // position, so the cut fell on a character boundary.
                    value = resultString;
                }
                else
                {
                    // The cut split a character: drop one byte and decode again.
                    value = gbk.GetString(myByte, 0, length - 1);
                }
            }

            wasCut = true;
        }
    }

    return (wasCut && ellipsis) ? value + ".." : value;
}

说实话，这个算法真的是胜之不武，别人都是通过减少拆箱和装箱来提高性能的，而它干脆连循环也不要了，让前面诸位真的无语了。

最后再介绍一个截短字符串的方法：

/// <summary>
/// Shortens a string to a byte budget measured in the system default encoding
/// and appends "..." when it had to be cut.
/// </summary>
/// <param name="mText">The string to shorten.</param>
/// <param name="byteCount">The byte budget; values below 1 mean "no limit".</param>
/// <returns>
/// <paramref name="mText"/> unchanged when <paramref name="byteCount"/> is
/// below 1 or the text already fits; otherwise the longest run of whole
/// characters that fits into <paramref name="byteCount"/> - 4 bytes, followed
/// by "...". When <paramref name="byteCount"/> is below 4 nothing is kept and
/// the result is just "...".
/// </returns>
public static string Left(string mText, int byteCount)
{
    if (byteCount < 1)
    {
        return mText;
    }

    System.Text.Encoding encoding = System.Text.Encoding.Default;
    if (encoding.GetByteCount(mText) <= byteCount)
    {
        return mText;
    }

    // Four bytes are held back from the budget before the ellipsis is added.
    // Clamp at zero so a tiny budget no longer produces a negative array size.
    int budget = (byteCount > 4) ? byteCount - 4 : 0;

    // Walk whole characters instead of cutting raw bytes and then looking for
    // a '?' left by the decoder: that check also removed a genuine trailing
    // '?' and missed decoders that emit a different replacement character.
    char[] chars = mText.ToCharArray();
    int usedBytes = 0;
    int keep = 0;
    while (keep < chars.Length)
    {
        // Measure a surrogate pair as one unit so it is never split.
        bool isPair = char.IsHighSurrogate(chars[keep])
            && keep + 1 < chars.Length
            && char.IsLowSurrogate(chars[keep + 1]);
        int unit = isPair ? 2 : 1;

        int cost = encoding.GetByteCount(chars, keep, unit);
        if (usedBytes + cost > budget)
        {
            break;
        }

        usedBytes += cost;
        keep += unit;
    }

    return mText.Substring(0, keep) + "...";
}

本程序测试结果是通过本人的机器测试，因机器不同，测试时间不同，测试结果难免也有差异，不过同一个算法的差别不是很大。下面我给出测试的源程序：

class Program

{

// Every benchmark in this example is measured the same way: 10,000 calls
// with a limit of 255, timed with a Stopwatch.
static void Main(string[] args)
{
    // Build the input: 10,000 characters alternating 'A' and 'B'.
    char[] sArr = new char[10000];
    for (int idx = 0; idx < sArr.Length; idx++)
    {
        sArr[idx] = (idx % 2 == 0) ? 'A' : 'B';
    }

    string s = new string(sArr);

    // Insert "中国人" at every offset that is a multiple of 10.
    for (int offset = 0; offset < sArr.Length; offset += 10)
    {
        s = s.Insert(offset, "中国人");
    }

    System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();

    // Truncate: about 22 ms.
    sw.Start();
    for (int run = 0; run < 10000; run++)
    {
        Truncate(s, 255);
    }
    sw.Stop();
    Console.WriteLine("字符串截取的方法Truncate: " + sw.Elapsed.TotalMilliseconds + "ms");

    // Intercept1: about 271 ms.
    sw.Reset();
    sw.Start();
    for (int run = 0; run < 10000; run++)
    {
        Intercept1(s, 255);
    }
    sw.Stop();
    Console.WriteLine("字符串截取的方法Intercept1: " + sw.Elapsed.TotalMilliseconds + "ms");

    // HalfSubstring: about 540 ms.
    sw.Reset();
    sw.Start();
    for (int run = 0; run < 10000; run++)
    {
        HalfSubstring(s, 255);
    }
    sw.Stop();
    Console.WriteLine("字符串截取的方法HalfSubstring: " + sw.Elapsed.TotalMilliseconds + "ms");

    // Intercept: about 1784 ms.
    sw.Reset();
    sw.Start();
    for (int run = 0; run < 10000; run++)
    {
        Intercept(s, 255);
    }
    sw.Stop();
    Console.WriteLine("按奇偶位判断的方法Intercept: " + sw.Elapsed.TotalMilliseconds + "ms");

    // Intercept2 is left disabled: it was too slow to wait for.
    //sw.Reset();
    //sw.Start();
    //for (int i = 0; i < 10000; i++)
    //    Intercept2(s, 255);
    //sw.Stop();
    //Console.WriteLine("网友提供的方法: " + sw.Elapsed.TotalMilliseconds + "ms");

    Console.Read();
}

/// <summary>
/// Returns a prefix of <paramref name="original"/> sized by a column budget in
/// which every character above 0xFF costs two columns and every other
/// character costs one. Measured at about 22 ms in the author's benchmark.
/// </summary>
/// <remarks>
/// A two-column character that straddles the limit is kept rather than
/// dropped, so the result can be one column wider than
/// <paramref name="length"/>.
/// </remarks>
/// <param name="original">The string to shorten.</param>
/// <param name="length">The column budget.</param>
/// <returns>The selected prefix of <paramref name="original"/>.</returns>
public static string Truncate(string original, int length)
{
    int total = original.Length;
    int budget = length;
    int scanned = 0;

    // Each wide character consumes one extra column, so the number of
    // characters that may still be taken shrinks by one.
    while (scanned < budget && scanned < total)
    {
        if (original[scanned] > 0xFF)
        {
            budget--;
        }
        scanned++;
    }

    int take;
    if (budget < scanned)
    {
        // The last scanned character pushed the budget below the scan
        // position; keep everything scanned so far.
        take = scanned;
    }
    else
    {
        take = (budget > total) ? total : budget;
    }

    return original.Substring(0, take);
}

/// <summary>
/// Cuts <paramref name="input"/> to roughly <paramref name="p"/> GB2312 bytes
/// without splitting a double-byte character. Measured at about 1784 ms in
/// the author's benchmark.
/// </summary>
/// <param name="input">The string to shorten.</param>
/// <param name="p">The byte budget.</param>
/// <returns>
/// The original string when it encodes to at most <paramref name="p"/> bytes;
/// otherwise the text decoded from the first <paramref name="p"/> bytes, or
/// from <paramref name="p"/> + 1 bytes when the cut would otherwise fall
/// inside a double-byte character.
/// </returns>
public static string Intercept(string input, int p)
{
    Encoding gb2312 = Encoding.GetEncoding("gb2312");
    byte[] encoded = gb2312.GetBytes(input);
    if (encoded.Length <= p)
    {
        return input;
    }

    // Key step: count the bytes above 127 among the first p bytes.
    int highBytes = 0;
    int scanned = 0;
    while (scanned < p)
    {
        if (encoded[scanned] > 127)
        {
            highBytes++;
        }
        scanned++;
    }

    // An odd count means the cut lands in the middle of a double-byte
    // character, so one more byte is taken to keep that character whole.
    int cut = (highBytes % 2 != 0) ? p + 1 : scanned;
    return gb2312.GetString(encoded, 0, cut);
}

/// <summary>
/// Shortens <paramref name="input"/> and appends "..." to the kept part.
/// Measured at about 271 ms in the author's benchmark.
/// </summary>
/// <remarks>
/// The early exit compares the character count, not the byte count, with
/// <paramref name="length"/>. Characters are then copied until their
/// accumulated byte count in the system default encoding reaches
/// <paramref name="length"/> - 1, and the ellipsis is added on top of that.
/// </remarks>
/// <param name="input">The string to shorten.</param>
/// <param name="length">The approximate byte budget.</param>
/// <returns>
/// An empty string for empty input, the input itself when it has at most
/// <paramref name="length"/> characters, otherwise the kept prefix followed
/// by "...".
/// </returns>
public static string Intercept1(string input, int length)
{
    if (input.Length == 0)
    {
        return string.Empty;
    }

    if (input.Length <= length)
    {
        return input;
    }

    char[] chars = input.ToCharArray();
    StringBuilder kept = new StringBuilder();
    int byteTotal = 0;
    int pos = 0;

    // Copy one character at a time until the byte total reaches length - 1.
    while (pos < chars.Length && byteTotal < (length - 1))
    {
        kept.Append(chars[pos]);
        byteTotal += Encoding.Default.GetByteCount(chars, pos, 1);
        pos++;
    }

    kept.Append("...");
    return kept.ToString();
}

/// <summary>
/// Returns <paramref name="input"/> unchanged when its GB2312 byte count is at
/// most <paramref name="length"/>; otherwise returns the longest proper prefix
/// whose GB2312 byte count does not exceed <paramref name="length"/>.
/// </summary>
/// <remarks>
/// This is the deliberately brute-force variant, described by the author as
/// too slow to wait for: every candidate prefix is re-encoded from scratch,
/// so the work grows quadratically with the input.
/// </remarks>
/// <param name="input">The string to shorten.</param>
/// <param name="length">The maximum number of GB2312 bytes to keep.</param>
/// <returns>The original string, a prefix of it, or an empty string.</returns>
public static string Intercept2(string input, int length)
{
    // Resolve the encoding once instead of on every loop iteration.
    System.Text.Encoding gb2312 = System.Text.Encoding.GetEncoding("GB2312");

    if (length >= gb2312.GetByteCount(input))
    {
        return input;
    }

    // Try ever shorter prefixes until one fits into the byte budget.
    for (int i = input.Length - 1; i >= 0; i--)
    {
        string prefix = input.Substring(0, i);
        if (gb2312.GetByteCount(prefix) <= length)
        {
            return prefix;
        }
    }

    // Only reachable when length is negative: not even the empty prefix fits.
    return string.Empty;
}

/// <summary>
/// Shortens <paramref name="str"/> to a display width of exactly
/// <paramref name="strLength"/> columns, where a character whose UTF-16 high
/// byte is zero counts as one column and every other character counts as two.
/// Measured at about 540 ms in the author's benchmark.
/// </summary>
/// <param name="str">The string to shorten.</param>
/// <param name="strLength">The maximum display width in columns.</param>
/// <returns>
/// The original string when it already fits; an empty string when
/// <paramref name="strLength"/> is not positive; otherwise a prefix whose last
/// one or two columns are replaced by '.' so that the total width equals
/// <paramref name="strLength"/>.
/// </returns>
static string HalfSubstring(string str, int strLength)
{
    if (strLength <= 0)
    {
        return string.Empty;
    }

    byte[] bytesStr = System.Text.Encoding.Unicode.GetBytes(str);

    // Measure the display width first, so that a string that fits exactly is
    // returned untouched instead of having its last character turned into dots.
    int totalWidth = 0;
    for (int i = 0; i < bytesStr.Length; i += 2)
    {
        totalWidth += (bytesStr[i + 1] == 0) ? 1 : 2;
    }

    if (totalWidth <= strLength)
    {
        return str;
    }

    const byte Dot = 46; // '.' as the low byte of a little-endian UTF-16 unit
    List<byte> list = new List<byte>();
    int count = 0;

    for (int i = 0; i < bytesStr.Length && count < strLength; i += 2)
    {
        int width = (bytesStr[i + 1] == 0) ? 1 : 2;

        if (count + width < strLength)
        {
            // The character fits with room to spare: copy it verbatim.
            list.Add(bytesStr[i]);
            list.Add(bytesStr[i + 1]);
            count += width;
        }
        else
        {
            // The character reaches or crosses the limit: fill the remaining
            // one or two columns with dots, which also ends the loop.
            while (count < strLength)
            {
                list.Add(Dot);
                list.Add(0);
                count++;
            }
        }
    }

    return System.Text.Encoding.Unicode.GetString(list.ToArray());
}

/// <summary>
/// Trims <paramref name="value"/> and cuts it to a byte budget.
/// </summary>
/// <param name="value">The input string.</param>
/// <param name="length">The byte budget.</param>
/// <param name="ellipsis">True to append ".." when the string was cut; false to append nothing.</param>
/// <param name="cuttype">How characters are counted against the budget.</param>
/// <returns>
/// The cut string. With NVarchar a budget of 20 bytes yields 10 characters;
/// with Varchar a budget of 20 bytes yields 10 or more characters. A
/// non-positive budget yields an empty string (plus ".." when requested).
/// </returns>
public static string CutString(string value, int length, bool ellipsis, CutType cuttype)
{
    value = value.Trim();
    if (value.Length == 0)
    {
        return string.Empty;
    }

    bool wasCut = false;

    if (cuttype == CutType.NVarchar)
    {
        // Two bytes per character; clamp so a negative budget cannot reach Substring.
        int maxChars = (length / 2 > 0) ? length / 2 : 0;
        if (value.Length > maxChars)
        {
            value = value.Substring(0, maxChars);
            wasCut = true;
        }
    }
    else
    {
        // Resolve the encoding once; it is needed for up to three calls.
        Encoding gbk = Encoding.GetEncoding("gbk");
        byte[] myByte = gbk.GetBytes(value);
        if (myByte.Length > length)
        {
            if (length <= 0)
            {
                value = string.Empty;
            }
            else
            {
                string resultString = gbk.GetString(myByte, 0, length);
                int lastIndex = resultString.Length - 1;
                if (resultString[lastIndex] == value[lastIndex])
                {
                    // The last decoded character matches the input at the same
                    // position, so the cut fell on a character boundary.
                    value = resultString;
                }
                else
                {
                    // The cut split a character: drop one byte and decode again.
                    value = gbk.GetString(myByte, 0, length - 1);
                }
            }

            wasCut = true;
        }
    }

    return (wasCut && ellipsis) ? value + ".." : value;
}

#region String shortening that distinguishes Chinese and English characters

/// <summary>
/// Shortens a string to a byte budget measured in the system default encoding
/// and appends "..." when it had to be cut.
/// </summary>
/// <param name="mText">The string to shorten.</param>
/// <param name="byteCount">The byte budget; values below 1 mean "no limit".</param>
/// <returns>
/// <paramref name="mText"/> unchanged when <paramref name="byteCount"/> is
/// below 1 or the text already fits; otherwise the longest run of whole
/// characters that fits into <paramref name="byteCount"/> - 4 bytes, followed
/// by "...". When <paramref name="byteCount"/> is below 4 nothing is kept and
/// the result is just "...".
/// </returns>
public static string Left(string mText, int byteCount)
{
    if (byteCount < 1)
    {
        return mText;
    }

    System.Text.Encoding encoding = System.Text.Encoding.Default;
    if (encoding.GetByteCount(mText) <= byteCount)
    {
        return mText;
    }

    // Four bytes are held back from the budget before the ellipsis is added.
    // Clamp at zero so a tiny budget no longer produces a negative array size.
    int budget = (byteCount > 4) ? byteCount - 4 : 0;

    // Walk whole characters instead of cutting raw bytes and then looking for
    // a '?' left by the decoder: that check also removed a genuine trailing
    // '?' and missed decoders that emit a different replacement character.
    char[] chars = mText.ToCharArray();
    int usedBytes = 0;
    int keep = 0;
    while (keep < chars.Length)
    {
        // Measure a surrogate pair as one unit so it is never split.
        bool isPair = char.IsHighSurrogate(chars[keep])
            && keep + 1 < chars.Length
            && char.IsLowSurrogate(chars[keep + 1]);
        int unit = isPair ? 2 : 1;

        int cost = encoding.GetByteCount(chars, keep, unit);
        if (usedBytes + cost > budget)
        {
            break;
        }

        usedBytes += cost;
        keep += unit;
    }

    return mText.Substring(0, keep) + "...";
}

#endregion

}

/// <summary>
/// Selects how characters are counted when cutting a string.
/// </summary>
public enum CutType
{
    /// <summary>An English character counts as one byte, a Chinese character as two.</summary>
    Varchar = 0,

    /// <summary>Every character counts as two bytes, English or Chinese.</summary>
    NVarchar = 1
}

会员力量，点亮园子希望

刷新页面返回顶部

非淡泊无以明<span style="color:red">志</span>，非宁静无以致<span style="color:red">远</span> 人之所以能，是相信能！

中英文字符串截取大比拼

About