代码改变世界

中英文字符串截取大比拼

2009-04-04 20:23  yearN  阅读(703)  评论(3)  编辑  收藏  举报

今天忙里偷闲,在网上收集了多个中英文字符串截取算法,测试一下它们哪个算法最优,拿来与大家分享!

测试程序如下:

// Benchmark input: 10,000 characters alternating 'A' (even index) and 'B' (odd index).
char[] sArr = new char[10000];
for (int pos = 0; pos < sArr.Length; pos++)
{
    sArr[pos] = (pos % 2 == 0) ? 'A' : 'B';
}

// Insert "中国人" at every index that is a multiple of 10. Each insertion is made
// into the string as already modified, so earlier insertions shift the later text.
string s = new string(sArr);
for (int at = 0; at < sArr.Length; at += 10)
{
    s = s.Insert(at, "中国人");
}

这个长度测试起来,够可以了吧,呵呵!

先看一个比较慢的算法:

   public static string Intercept2(string input, int length)

        {

            string res = String.Empty;

            int bytecount = System.Text.Encoding.GetEncoding("GB2312").GetByteCount(input);

            if (length >= bytecount)

            {

                return input;

            }

            for (int i = input.Length - 1; i >= 0; i--)

            {

                if (System.Text.Encoding.GetEncoding("GB2312").GetByteCount(input.Substring(0, i)) <= length)

                {

                    return input.Substring(0, i);

                }

            }

            return string.Empty;

        }

这个算法的用时我没耐心等,估计用时要超过一分钟,有兴趣的朋友可以去等等看看。

再看下一个:

  public static string Intercept(string input, int p)

        {

            Encoding encode = Encoding.GetEncoding("gb2312");

            byte[] byteArr = encode.GetBytes(input);

            if (byteArr.Length <= p) return input;

 

            int m = 0, n = 0;

            foreach (byte b in byteArr)

            {

                if (n >= p) break;

                if (b > 127) m++; //重要一步:对前p个字节中的值大于127的字符进行统计

                n++;

            }

            if (m % 2 != 0) n = p + 1; //如果非偶:则说明末尾为双字节字符,截取位数加1

 

            return encode.GetString(byteArr, 0, n);

        }

这个按奇偶位判断的算法耗时1784毫秒左右,我们再看一下:

/// <summary>
/// Shortens <paramref name="str"/> to <paramref name="strLength"/> display columns,
/// counting characters up to U+00FF as one column and all others as two, and
/// fills the tail with '.' when something had to be cut.
/// </summary>
/// <param name="str">Text to shorten; must not be null.</param>
/// <param name="strLength">Column budget. Zero or less yields an empty string.</param>
/// <returns>
/// <paramref name="str"/> unchanged when it fits. Otherwise the leading characters
/// followed by one or two dots, so that the result is exactly
/// <paramref name="strLength"/> columns wide.
/// </returns>
static string HalfSubstring(string str, int strLength)
{
    if (str == null)
        throw new System.ArgumentNullException("str");
    if (strLength <= 0)
        return string.Empty;

    // Measure in columns. The previous check compared the UTF-16 byte count with
    // the column budget, so a string that fitted exactly still lost its last
    // character to a dot.
    int width = 0;
    for (int i = 0; i < str.Length && width <= strLength; i++)
        width += str[i] > 0xFF ? 2 : 1;
    if (width <= strLength)
        return str;

    System.Text.StringBuilder result = new System.Text.StringBuilder(strLength);
    int used = 0;
    foreach (char c in str)
    {
        int cost = c > 0xFF ? 2 : 1;
        if (used + cost >= strLength)
        {
            // This character would fill or overflow the budget: the remaining
            // one or two columns become dots instead.
            result.Append('.', strLength - used);
            break;
        }
        result.Append(c);
        used += cost;
    }
    return result.ToString();
}

这个算法用时540毫秒左右,比上一个快了两倍多,不过还有更快的,

请看下一个:

        public static string Intercept1(string input, int length)

        {

            if (input.Length==0)

                return string.Empty;

            if (input.Length <= length)

                return input;

            int total = 0;

            StringBuilder temp = new StringBuilder();

            for (int i = 0; i < input.Length; i++)

            {

                if (total >= (length - 1)) break;

                string s = input.Substring(i, 1);

                temp.Append(s);

                total += Encoding.Default.GetByteCount(s);

            }

            temp.Append("...");

            return temp.ToString();

        }

这个算法用时271毫秒左右,比上一个快了将近一倍。不过,你认为它就是我们今天字符串截取算法大比拼中的冠军了吗?请看下一个:

   

        public static string Truncate(string original, int length)

        {

            int len = original.Length;

            int i = 0;

            for (; i < length && i < len; ++i)

            {

                if ((int)(original[i]) > 0xFF)

                    --length;

            }

            if (length < i)

                length = i;

            else if (length > len)

                length = len;

            return original.Substring(0, length);

        }

这个算法用时22毫秒左右,算是一个很精炼的算法了。这个算法已经可以说是我们这次大比拼中的冠军了,不过也凑巧,我们这次的目的是截取一个字符串的中文或者英文部分,又一个比较巧妙的算法出现了:

    先定义一个枚举:

/// <summary>
/// How a string is measured when it is cut.
/// </summary>
public enum CutType
{
    /// <summary>A Western character counts as one byte, a Chinese character as two.</summary>
    Varchar = 0,

    /// <summary>Every character counts as two bytes, Western or Chinese.</summary>
    NVarchar = 1
}

再看算法:

/// <summary>
/// Trims <paramref name="value"/> and cuts it to a byte budget, optionally
/// appending ".." when something was removed.
/// </summary>
/// <param name="value">Input text. Null is treated like an empty string.</param>
/// <param name="length">Byte budget. Negative values are treated as 0.</param>
/// <param name="ellipsis">true to append ".." to a shortened result.</param>
/// <param name="cuttype">NVarchar counts every character as two bytes;
/// any other value measures the text by its "gbk" encoding.</param>
/// <returns>
/// The trimmed text when it fits; otherwise the shortened text. The ".." suffix
/// is added on top of the budget.
/// </returns>
public static string CutString(string value, int length, bool ellipsis, CutType cuttype)
{
    if (value == null)
        return string.Empty;
    value = value.Trim();
    if (value.Length == 0)
        return string.Empty;

    // A negative budget used to make Substring / GetString throw.
    if (length < 0)
        length = 0;

    if (cuttype == CutType.NVarchar)
    {
        int maxChars = length / 2;
        if (value.Length <= maxChars)
            return value;

        value = value.Substring(0, maxChars);
        return ellipsis ? value + ".." : value;
    }

    Encoding gbk = Encoding.GetEncoding("gbk"); // looked up once instead of three times
    byte[] encoded = gbk.GetBytes(value);
    if (encoded.Length <= length)
        return value;

    string head = gbk.GetString(encoded, 0, length);

    // If the cut fell inside a double-byte character, the last decoded character
    // differs from the source character at the same index; drop the dangling byte.
    // An empty head (length 0) has no last character to inspect, which previously
    // raised ArgumentOutOfRangeException.
    bool splitsCharacter = head.Length > 0 && head[head.Length - 1] != value[head.Length - 1];
    if (splitsCharacter)
        head = gbk.GetString(encoded, 0, length - 1);

    return ellipsis ? head + ".." : head;
}

说实话,这个算法真的是胜之不武,别人都是通过减少拆箱和装箱来提高性能的,而它干脆连循环也不要了,让前面诸位真的无语了。

最后再介绍一个截短字符串的方法:

  /// <summary>

      /// 截短字串的函数

      /// </summary>

      /// <param name="mText">要加工的字串</param>

      /// <param name="byteCount">长度</param>

      /// <returns>被加工过的字串</returns>

      public static string Left(string mText, int byteCount)

      {

          if (byteCount < 1)

              return mText;

 

          if (System.Text.Encoding.Default.GetByteCount(mText) <= byteCount)

          {

              return mText;

          }

          else

          {

              byte[] txtBytes = System.Text.Encoding.Default.GetBytes(mText);

              byte[] newBytes = new byte[byteCount - 4];

 

              for (int i = 0; i < byteCount - 4; i++)

              {

                  newBytes[i] = txtBytes[i];

              }

              string OutPut = System.Text.Encoding.Default.GetString(newBytes) + "...";

              if (OutPut.EndsWith("?...") == true)

              {

                  OutPut = OutPut.Substring(0, OutPut.Length - 4);

                  OutPut += "...";

              }

              return OutPut;

          }

      }

以上测试结果均在本人的机器上测得。机器不同,测试用时也会不同,结果难免有差异,不过同一个算法在不同机器上的差别不会很大。下面我给出测试的源程序:

class Program

    {

       //本示例测试方法相同

 

        static void Main(string[] args)

        {

            char[] sArr = new char[10000];

            for (int i = 0; i < 10000; i++)

            {

                if (i % 2 == 0)

                    sArr[i] = 'A';

                else

                    sArr[i] = 'B';

            }

            string s = new string(sArr);

            for (int i = 0; i < sArr.Length; i++)

            {

                if (i % 10 == 0)

                    s = s.Insert(i, "中国人");

            }

 

            System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();

            sw.Start();

            for (int i = 0; i < 10000; i++)//这个算法用时:22毫秒左右

                Truncate(s, 255);

            sw.Stop();

            Console.WriteLine("字符串截取的方法Truncate: " + sw.Elapsed.TotalMilliseconds + "ms");

 

            sw.Reset();

            sw.Start();

            for (int i = 0; i < 10000; i++) //这个用时271毫秒左右

                Intercept1(s, 255);

            sw.Stop();

            Console.WriteLine("字符串截取的方法Intercept1: " + sw.Elapsed.TotalMilliseconds + "ms");

 

            sw.Reset();

            sw.Start();

            for (int i = 0; i < 10000; i++) //这个用时540毫秒左右

                HalfSubstring(s, 255);

            sw.Stop();

            Console.WriteLine("字符串截取的方法HalfSubstring: " + sw.Elapsed.TotalMilliseconds + "ms");

 

            sw.Reset();

            sw.Start();

            for (int i = 0; i < 10000; i++) //这个用时1784毫秒左右

                Intercept(s, 255);

            sw.Stop();

 

            Console.WriteLine("按奇偶位判断的方法Intercept: " + sw.Elapsed.TotalMilliseconds + "ms");

 

            //sw.Reset();

            //sw.Start();

            //for (int i = 0; i < 10000; i++)//这个慢的吓人,我没耐心等...

            //    Intercept2(s, 255);

            //sw.Stop();

 

            //Console.WriteLine("网友提供的方法: " + sw.Elapsed.TotalMilliseconds + "ms");

            Console.Read();

        }

 

        //这个算法用时:22毫秒左右

        public static string Truncate(string original, int length)

        {

            int len = original.Length;

            int i = 0;

            for (; i < length && i < len; ++i)

            {

                if ((int)(original[i]) > 0xFF)

                    --length;

            }

            if (length < i)

                length = i;

            else if (length > len)

                length = len;

            return original.Substring(0, length);

        }

 

        //这个用时1784毫秒左右

        public static string Intercept(string input, int p)

        {

            Encoding encode = Encoding.GetEncoding("gb2312");

            byte[] byteArr = encode.GetBytes(input);

            if (byteArr.Length <= p) return input;

 

            int m = 0, n = 0;

            foreach (byte b in byteArr)

            {

                if (n >= p) break;

                if (b > 127) m++; //重要一步:对前p个字节中的值大于127的字符进行统计

                n++;

            }

            if (m % 2 != 0) n = p + 1; //如果非偶:则说明末尾为双字节字符,截取位数加1

 

            return encode.GetString(byteArr, 0, n);

        }

        //这个用时271毫秒左右

        public static string Intercept1(string input, int length)

        {

            if (input.Length==0)

                return string.Empty;

            if (input.Length <= length)

                return input;

            int total = 0;

            StringBuilder temp = new StringBuilder();

            for (int i = 0; i < input.Length; i++)

            {

                if (total >= (length - 1)) break;

                string s = input.Substring(i, 1);

                temp.Append(s);

                total += Encoding.Default.GetByteCount(s);

            }

            temp.Append("...");

            return temp.ToString();

        }

        //这个慢的吓人,我没耐心等...

        public static string Intercept2(string input, int length)

        {

            string res = String.Empty;

            int bytecount = System.Text.Encoding.GetEncoding("GB2312").GetByteCount(input);

            if (length >= bytecount)

            {

                return input;

            }

            for (int i = input.Length - 1; i >= 0; i--)

            {

                if (System.Text.Encoding.GetEncoding("GB2312").GetByteCount(input.Substring(0, i)) <= length)

                {

                    return input.Substring(0, i);

                }

            }

            return string.Empty;

        }

 

        //这个用时540毫秒左右

      static string HalfSubstring(string str, int strLength)

        {

            if (System.Text.Encoding.Unicode.GetByteCount(str) < strLength)

                return str;

            byte[] bytesStr = System.Text.Encoding.Unicode.GetBytes(str);

            List<byte> list = new List<byte>();

            int count = 0;

            for (int i = 0; i < bytesStr.Length; i += 2)

            {

                if (count == strLength)

                    break;

                if (bytesStr[i + 1] == 0)

                {

                    if (count + 1 == strLength)

                    {

                        list.Add(46);

                        list.Add(0);

                        count++;

                    }

                    else

                    {

                        list.Add(bytesStr[i]);

                        list.Add(bytesStr[i + 1]);

                        count++;

                    }

                }

                else

                {

                    if (count + 2 > strLength)

                    {

                        list.Add(46);

                        list.Add(0);

                        count++;

                    }

                    else if (count + 2 == strLength)

                    {

                        list.Add(46);

                        list.Add(0);

                        list.Add(46);

                        list.Add(0);

                        count += 2;

                    }

                    else

                    {

                        list.Add(bytesStr[i]);

                        list.Add(bytesStr[i + 1]);

                        count += 2;

                    }

                }

            }

            return System.Text.Encoding.Unicode.GetString(list.ToArray());

        }

 

 

      /// <summary>

      /// 要截取的字节数

      /// </summary>

      /// <param name="value">输入的字符串</param>

      /// <param name="length">限定长度</param>

      /// <param name="ellipsis">是否需要省略号,true--需要,false--不需要</param>

      /// <param name="cuttype">截取类型</param>

      /// <returns>截取后的字符串,如果是NVarchar--20个字节就会有10个字符,Varchar--20个字节会有>=10个字符</returns>

      public static string CutString(string value, int length, bool ellipsis, CutType cuttype)

      {

          value = value.Trim();

          if (value.Length == 0)

              return string.Empty;

          if (cuttype == CutType.NVarchar)

          {

              if (value.Length > length / 2)

              {

                  value = value.Substring(0, length / 2);

                  if (ellipsis)

                      return value + "..";

              }

          }

          else

          {

              string resultString = string.Empty;

              byte[] myByte = System.Text.Encoding.GetEncoding("gbk").GetBytes(value);

              if (myByte.Length > length)

              {

                  resultString = Encoding.GetEncoding("gbk").GetString(myByte, 0, length);

                  string lastChar = resultString.Substring(resultString.Length - 1, 1);

                  if (lastChar.Equals(value.Substring(resultString.Length - 1, 1)))

                  { value = resultString; }//如果截取后最后一个字符与原始输入字符串中同一位置的字符相等,则表示截取完成

                  else//如果不相等,则减去一个字节再截取

                  {

                      value = Encoding.GetEncoding("gbk").GetString(myByte, 0, length - 1);

                  }

                  if (ellipsis)

                      return value + "..";

                  return value;

              }

          }

          return value;

      }

      #region  截短字串的函数,分区中英文

      /// <summary>

      /// 截短字串的函数

      /// </summary>

      /// <param name="mText">要加工的字串</param>

      /// <param name="byteCount">长度</param>

      /// <returns>被加工过的字串</returns>

      public static string Left(string mText, int byteCount)

      {

          if (byteCount < 1)

              return mText;

 

          if (System.Text.Encoding.Default.GetByteCount(mText) <= byteCount)

          {

              return mText;

          }

          else

          {

              byte[] txtBytes = System.Text.Encoding.Default.GetBytes(mText);

              byte[] newBytes = new byte[byteCount - 4];

 

              for (int i = 0; i < byteCount - 4; i++)

              {

                  newBytes[i] = txtBytes[i];

              }

              string OutPut = System.Text.Encoding.Default.GetString(newBytes) + "...";

              if (OutPut.EndsWith("?...") == true)

              {

                  OutPut = OutPut.Substring(0, OutPut.Length - 4);

                  OutPut += "...";

              }

              return OutPut;

          }

      }

      #endregion

 

 

    }

    /// <summary>

    /// 截取字符枚举值,Varchar--英文一个字节,中文两个字节,NVarchar--无论中英文都是两个字节

    /// </summary>

    public enum CutType

    {

        Varchar,

        NVarchar

    }