中英文字符串截取大比拼
2009-04-04 20:23 yearN 阅读(703) 评论(3) 编辑 收藏 举报今天忙里偷闲,在网上收集了多个中英文字符串截取算法,测试一下它们哪个算法最优,拿来与大家分享!
测试程序如下:
char[] sArr = new char[10000];
for (int i = 0; i < 10000; i++)
{
if (i % 2 == 0)
sArr[i] = 'A';
else
sArr[i] = 'B';
}
string s = new string(sArr);
for (int i = 0; i < sArr.Length; i++)
{
if (i % 10 == 0)
s = s.Insert(i, "中国人");
}
这个长度测试起来,够可以了吧,呵呵!
先看一个比较慢的算法:
public static string Intercept2(string input, int length)
{
string res = String.Empty;
int bytecount = System.Text.Encoding.GetEncoding("GB2312").GetByteCount(input);
if (length >= bytecount)
{
return input;
}
for (int i = input.Length - 1; i >= 0; i--)
{
if (System.Text.Encoding.GetEncoding("GB2312").GetByteCount(input.Substring(0, i)) <= length)
{
return input.Substring(0, i);
}
}
return string.Empty;
}
这个算法的用时我没耐心等,估计用时要超过一分钟,有兴趣的朋友可以去等等看看。
再看下一个:
public static string Intercept(string input, int p)
{
Encoding encode = Encoding.GetEncoding("gb2312");
byte[] byteArr = encode.GetBytes(input);
if (byteArr.Length <= p) return input;
int m = 0, n = 0;
foreach (byte b in byteArr)
{
if (n >= p) break;
if (b > 127) m++; //重要一步:对前p个字节中的值大于127的字符进行统计
n++;
}
if (m % 2 != 0) n = p + 1; //如果非偶:则说明末尾为双字节字符,截取位数加1
return encode.GetString(byteArr, 0, n);
}
这个按奇偶位判断的算法耗时1784毫秒左右,我们再看一下:
static string HalfSubstring(string str, int strLength)
{
if (System.Text.Encoding.Unicode.GetByteCount(str) < strLength)
return str;
byte[] bytesStr = System.Text.Encoding.Unicode.GetBytes(str);
List<byte> list = new List<byte>();
int count = 0;
for (int i = 0; i < bytesStr.Length; i += 2)
{
if (count == strLength)
break;
if (bytesStr[i + 1] == 0)
{
if (count + 1 == strLength)
{
list.Add(46);
list.Add(0);
count++;
}
else
{
list.Add(bytesStr[i]);
list.Add(bytesStr[i + 1]);
count++;
}
}
else
{
if (count + 2 > strLength)
{
list.Add(46);
list.Add(0);
count++;
}
else if (count + 2 == strLength)
{
list.Add(46);
list.Add(0);
list.Add(46);
list.Add(0);
count += 2;
}
else
{
list.Add(bytesStr[i]);
list.Add(bytesStr[i + 1]);
count += 2;
}
}
}
return System.Text.Encoding.Unicode.GetString(list.ToArray());
}
这个算法用时540毫秒左右,比着上一个要快一倍多,不过还有更快的,
请看下一个:
public static string Intercept1(string input, int length)
{
if (input.Length==0)
return string.Empty;
if (input.Length <= length)
return input;
int total = 0;
StringBuilder temp = new StringBuilder();
for (int i = 0; i < input.Length; i++)
{
if (total >= (length - 1)) break;
string s = input.Substring(i, 1);
temp.Append(s);
total += Encoding.Default.GetByteCount(s);
}
temp.Append("...");
return temp.ToString();
}
这个算法明显要比上一个要快很多,不过你认为它是我们今天字符串截取算法大比拼中的冠军了吗,请看下一个:
public static string Truncate(string original, int length)
{
int len = original.Length;
int i = 0;
for (; i < length && i < len; ++i)
{
if ((int)(original[i]) > 0xFF)
--length;
}
if (length < i)
length = i;
else if (length > len)
length = len;
return original.Substring(0, length);
}
这个算法用时22毫秒左右,算是一个很精炼的算法了。这个算法已经可以说是我个这次大比拼中的冠军了,不过也凑巧,我们这次的目的是截取一个字符串的中文或者英文部分,又一个比较巧妙的算法出现了:
先定义一个枚举:
/// <summary>
/// 截取字符枚举值,Varchar--英文一个字节,中文两个字节,NVarchar--无论中英文都是两个字节
/// </summary>
public enum CutType
{
Varchar,
NVarchar
}
再看算法:
/// <summary>
/// 要截取的字节数
/// </summary>
/// <param name="value">输入的字符串</param>
/// <param name="length">限定长度</param>
/// <param name="ellipsis">是否需要省略号,true--需要,false--不需要</param>
/// <param name="cuttype">截取类型</param>
/// <returns>截取后的字符串,如果是NVarchar--则20个字节就会有10个字符,Varchar--20个字节会有>=10个字符</returns>
public static string CutString(string value, int length, bool ellipsis, CutType cuttype)
{
value = value.Trim();
if (value.Length == 0)
return string.Empty;
if (cuttype == CutType.NVarchar)
{
if (value.Length > length / 2)
{
value = value.Substring(0, length / 2);
if (ellipsis)
return value + "..";
}
}
else
{
string resultString = string.Empty;
byte[] myByte = System.Text.Encoding.GetEncoding("gbk").GetBytes(value);
if (myByte.Length > length)
{
resultString = Encoding.GetEncoding("gbk").GetString(myByte, 0, length);
string lastChar = resultString.Substring(resultString.Length - 1, 1);
if (lastChar.Equals(value.Substring(resultString.Length - 1, 1)))
{ value = resultString; }//如果截取后最后一个字符与原始输入字符串中同一位置的字符相等,则表示截取完成
else//如果不相等,则减去一个字节再截取
{
value = Encoding.GetEncoding("gbk").GetString(myByte, 0, length - 1);
}
if (ellipsis)
return value + "..";
return value;
}
}
return value;
}
说实话,这个算法真的是胜之不武,别人都是通过减少拆箱和装箱来提高性能的,而它干脆连循环也不要了,让前面诸位真的无语了。
最后再介绍一个截短字符串的方法:
/// <summary>
/// 截短字串的函数
/// </summary>
/// <param name="mText">要加工的字串</param>
/// <param name="byteCount">长度</param>
/// <returns>被加工过的字串</returns>
public static string Left(string mText, int byteCount)
{
if (byteCount < 1)
return mText;
if (System.Text.Encoding.Default.GetByteCount(mText) <= byteCount)
{
return mText;
}
else
{
byte[] txtBytes = System.Text.Encoding.Default.GetBytes(mText);
byte[] newBytes = new byte[byteCount - 4];
for (int i = 0; i < byteCount - 4; i++)
{
newBytes[i] = txtBytes[i];
}
string OutPut = System.Text.Encoding.Default.GetString(newBytes) + "...";
if (OutPut.EndsWith("?...") == true)
{
OutPut = OutPut.Substring(0, OutPut.Length - 4);
OutPut += "...";
}
return OutPut;
}
}
本程序测试结果是通过本人的机器测试,因机器不同,测试时间不同,测试结果难免也有差异,不过同一个算法的差别不是很大。下面我给出测试的源程序:
class Program
{
//本示例测试方法相同
static void Main(string[] args)
{
char[] sArr = new char[10000];
for (int i = 0; i < 10000; i++)
{
if (i % 2 == 0)
sArr[i] = 'A';
else
sArr[i] = 'B';
}
string s = new string(sArr);
for (int i = 0; i < sArr.Length; i++)
{
if (i % 10 == 0)
s = s.Insert(i, "中国人");
}
System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
sw.Start();
for (int i = 0; i < 10000; i++)//这个算法用时:22毫秒左右
Truncate(s, 255);
sw.Stop();
Console.WriteLine("字符串截取的方法Truncate: " + sw.Elapsed.TotalMilliseconds + "ms");
sw.Reset();
sw.Start();
for (int i = 0; i < 10000; i++) //这个用时271毫秒左右
Intercept1(s, 255);
sw.Stop();
Console.WriteLine("字符串截取的方法Intercept1: " + sw.Elapsed.TotalMilliseconds + "ms");
sw.Reset();
sw.Start();
for (int i = 0; i < 10000; i++) //这个用时540毫秒左右
HalfSubstring(s, 255);
sw.Stop();
Console.WriteLine("字符串截取的方法HalfSubstring: " + sw.Elapsed.TotalMilliseconds + "ms");
sw.Reset();
sw.Start();
for (int i = 0; i < 10000; i++) //这个用时1784毫秒左右
Intercept(s, 255);
sw.Stop();
Console.WriteLine("按奇偶位判断的方法Intercept: " + sw.Elapsed.TotalMilliseconds + "ms");
//sw.Reset();
//sw.Start();
//for (int i = 0; i < 10000; i++)//这个慢的吓人,我没耐心等...
// Intercept2(s, 255);
//sw.Stop();
//Console.WriteLine("网友提供的方法: " + sw.Elapsed.TotalMilliseconds + "ms");
Console.Read();
}
//这个算法用时:22毫秒左右
public static string Truncate(string original, int length)
{
int len = original.Length;
int i = 0;
for (; i < length && i < len; ++i)
{
if ((int)(original[i]) > 0xFF)
--length;
}
if (length < i)
length = i;
else if (length > len)
length = len;
return original.Substring(0, length);
}
//这个用时1784毫秒左右
public static string Intercept(string input, int p)
{
Encoding encode = Encoding.GetEncoding("gb2312");
byte[] byteArr = encode.GetBytes(input);
if (byteArr.Length <= p) return input;
int m = 0, n = 0;
foreach (byte b in byteArr)
{
if (n >= p) break;
if (b > 127) m++; //重要一步:对前p个字节中的值大于127的字符进行统计
n++;
}
if (m % 2 != 0) n = p + 1; //如果非偶:则说明末尾为双字节字符,截取位数加1
return encode.GetString(byteArr, 0, n);
}
//这个用时271毫秒左右
public static string Intercept1(string input, int length)
{
if (input.Length==0)
return string.Empty;
if (input.Length <= length)
return input;
int total = 0;
StringBuilder temp = new StringBuilder();
for (int i = 0; i < input.Length; i++)
{
if (total >= (length - 1)) break;
string s = input.Substring(i, 1);
temp.Append(s);
total += Encoding.Default.GetByteCount(s);
}
temp.Append("...");
return temp.ToString();
}
//这个慢的吓人,我没耐心等...
public static string Intercept2(string input, int length)
{
string res = String.Empty;
int bytecount = System.Text.Encoding.GetEncoding("GB2312").GetByteCount(input);
if (length >= bytecount)
{
return input;
}
for (int i = input.Length - 1; i >= 0; i--)
{
if (System.Text.Encoding.GetEncoding("GB2312").GetByteCount(input.Substring(0, i)) <= length)
{
return input.Substring(0, i);
}
}
return string.Empty;
}
//这个用时540毫秒左右
static string HalfSubstring(string str, int strLength)
{
if (System.Text.Encoding.Unicode.GetByteCount(str) < strLength)
return str;
byte[] bytesStr = System.Text.Encoding.Unicode.GetBytes(str);
List<byte> list = new List<byte>();
int count = 0;
for (int i = 0; i < bytesStr.Length; i += 2)
{
if (count == strLength)
break;
if (bytesStr[i + 1] == 0)
{
if (count + 1 == strLength)
{
list.Add(46);
list.Add(0);
count++;
}
else
{
list.Add(bytesStr[i]);
list.Add(bytesStr[i + 1]);
count++;
}
}
else
{
if (count + 2 > strLength)
{
list.Add(46);
list.Add(0);
count++;
}
else if (count + 2 == strLength)
{
list.Add(46);
list.Add(0);
list.Add(46);
list.Add(0);
count += 2;
}
else
{
list.Add(bytesStr[i]);
list.Add(bytesStr[i + 1]);
count += 2;
}
}
}
return System.Text.Encoding.Unicode.GetString(list.ToArray());
}
/// <summary>
/// 要截取的字节数
/// </summary>
/// <param name="value">输入的字符串</param>
/// <param name="length">限定长度</param>
/// <param name="ellipsis">是否需要省略号,true--需要,false--不需要</param>
/// <param name="cuttype">截取类型</param>
/// <returns>截取后的字符串,如果是NVarchar--则20个字节就会有10个字符,Varchar--20个字节会有>=10个字符</returns>
public static string CutString(string value, int length, bool ellipsis, CutType cuttype)
{
value = value.Trim();
if (value.Length == 0)
return string.Empty;
if (cuttype == CutType.NVarchar)
{
if (value.Length > length / 2)
{
value = value.Substring(0, length / 2);
if (ellipsis)
return value + "..";
}
}
else
{
string resultString = string.Empty;
byte[] myByte = System.Text.Encoding.GetEncoding("gbk").GetBytes(value);
if (myByte.Length > length)
{
resultString = Encoding.GetEncoding("gbk").GetString(myByte, 0, length);
string lastChar = resultString.Substring(resultString.Length - 1, 1);
if (lastChar.Equals(value.Substring(resultString.Length - 1, 1)))
{ value = resultString; }//如果截取后最后一个字符与原始输入字符串中同一位置的字符相等,则表示截取完成
else//如果不相等,则减去一个字节再截取
{
value = Encoding.GetEncoding("gbk").GetString(myByte, 0, length - 1);
}
if (ellipsis)
return value + "..";
return value;
}
}
return value;
}
#region 截短字串的函数,分区中英文
/// <summary>
/// 截短字串的函数
/// </summary>
/// <param name="mText">要加工的字串</param>
/// <param name="byteCount">长度</param>
/// <returns>被加工过的字串</returns>
public static string Left(string mText, int byteCount)
{
if (byteCount < 1)
return mText;
if (System.Text.Encoding.Default.GetByteCount(mText) <= byteCount)
{
return mText;
}
else
{
byte[] txtBytes = System.Text.Encoding.Default.GetBytes(mText);
byte[] newBytes = new byte[byteCount - 4];
for (int i = 0; i < byteCount - 4; i++)
{
newBytes[i] = txtBytes[i];
}
string OutPut = System.Text.Encoding.Default.GetString(newBytes) + "...";
if (OutPut.EndsWith("?...") == true)
{
OutPut = OutPut.Substring(0, OutPut.Length - 4);
OutPut += "...";
}
return OutPut;
}
}
#endregion
}
/// <summary>
/// 截取字符枚举值,Varchar--英文一个字节,中文两个字节,NVarchar--无论中英文都是两个字节
/// </summary>
public enum CutType
{
Varchar,
NVarchar
}