关于操作字符串的简单探讨。
始于之前的阿里面试,根据面试官的提示,重写了三遍方法才写对,教条?不过我对此有另外的看法。
如下代码给出了三种不同的截取方式,其中按字符和按字节截取的两种都比面试官所说的方法快,而且速度不在同一个量级。
而以字节操作又比以字符操作略慢,其原因在于以字节操作需要先用 getBytes 按字符集编码,最后还要 new String,从源码可知后者会按字符集重新解码。
由此可知,要快速地操作字符串,根本的手段是以 Java 字符(char)形式进行操作,其次是字节形式;在字节与字符串之间反复互相转换是性能最差的方式。
import java.io.UnsupportedEncodingException;
import java.util.Random;

/**
 * Compares three ways of truncating a string to a byte budget without
 * splitting a character, and times them against each other.
 *
 * <p>"Byte budget" means the encoded size of the returned prefix must not
 * exceed the requested number of bytes. All three methods return an empty
 * string for a {@code null} input or a non-positive budget.
 */
public class Example {

    /** Charset used by the encoding-based methods when the caller passes {@code null}. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Inclusive lead-byte range that the byte scanner treats as the start of a two-byte char. */
    private static final int LEAD_BYTE_MIN = 0x81;
    private static final int LEAD_BYTE_MAX = 0xFE;

    /** Inclusive trail-byte range that the byte scanner accepts after a lead byte. */
    private static final int TRAIL_BYTE_MIN = 0x40;
    private static final int TRAIL_BYTE_MAX = 0xFE;

    /** Inclusive range of the ideographs produced by {@link #getRandomString(int)}. */
    private static final int CJK_FIRST = 0x4e00;
    private static final int CJK_LAST = 0x9fa5;

    private static final long NANOS_PER_MILLI = 1000000L;

    private static final Random RANDOM = new Random();

    /**
     * Truncates by walking the string char by char and estimating the encoded
     * size: a char below 256 counts as one byte, any other char as two.
     *
     * <p>No encoding is performed and {@code charset} is not consulted.
     * NOTE(review): the one/two byte estimate matches the ASCII and CJK
     * ideograph input generated by this class; it is presumably not exact for
     * every character of every charset - confirm before reusing elsewhere.
     *
     * @param src     string to truncate; {@code null} yields an empty string
     * @param length  maximum estimated size of the result in bytes
     * @param charset unused; kept so the three methods share one signature
     * @return the longest prefix of {@code src} whose estimated size fits in {@code length}
     * @throws UnsupportedEncodingException never thrown; declared for signature parity
     */
    public static String subStringInChars(String src, int length, String charset)
            throws UnsupportedEncodingException {
        if (src == null || length <= 0) {
            return "";
        }
        int charCount = src.length();
        int end = 0;
        int usedBytes = 0;
        // Bound the walk by the string itself, not by comparing the byte
        // budget with the char count (which used to return "" too early).
        while (end < charCount) {
            int width = src.charAt(end) < 256 ? 1 : 2;
            if (width > length - usedBytes) {
                break; // the next char would not fit completely
            }
            usedBytes += width;
            end++;
        }
        return src.substring(0, end);
    }

    /**
     * Truncates by encoding the whole string once, scanning the bytes for
     * character boundaries, and decoding the accepted prefix.
     *
     * <p>The boundary scan recognises a two-byte character by a lead byte in
     * 0x81..0xFE followed by a trail byte in 0x40..0xFE, so it is only
     * meaningful for charsets with that layout (the default, GBK, is the
     * intended one).
     *
     * @param src     string to truncate; {@code null} yields an empty string
     * @param length  maximum size of the result in encoded bytes
     * @param charset charset name, or {@code null} for GBK
     * @return the longest prefix of {@code src} that fits in {@code length} bytes
     * @throws UnsupportedEncodingException if {@code charset} is not supported
     */
    public static String subStringInBytes(String src, int length, String charset)
            throws UnsupportedEncodingException {
        if (charset == null) {
            charset = DEFAULT_CHARSET;
        }
        if (src == null) {
            return "";
        }
        byte[] bytes = src.getBytes(charset);
        int limit = Math.min(Math.max(length, 0), bytes.length);
        int end = 0;
        // Checking the limit before every step (instead of only after a
        // single-byte step) keeps the scan from running past the budget.
        while (end < limit) {
            int width = isDoubleByteAt(bytes, end) ? 2 : 1;
            if (width > limit - end) {
                break; // a two-byte char straddles the limit: drop it
            }
            end += width;
        }
        return new String(bytes, 0, end, charset);
    }

    /** Tells whether a two-byte character starts at {@code index}; never reads past the array. */
    private static boolean isDoubleByteAt(byte[] bytes, int index) {
        if (index + 1 >= bytes.length) {
            return false;
        }
        int lead = bytes[index] & 0xFF;
        int trail = bytes[index + 1] & 0xFF;
        return lead >= LEAD_BYTE_MIN && lead <= LEAD_BYTE_MAX
                && trail >= TRAIL_BYTE_MIN && trail <= TRAIL_BYTE_MAX;
    }

    /**
     * Truncates by repeatedly taking a shorter substring and re-encoding it
     * until the encoded size fits (the approach the interviewer accepted).
     *
     * <p>Each retry encodes the whole candidate again, which is what makes
     * this variant slow on long inputs.
     *
     * @param str        string to truncate; {@code null} yields an empty string
     * @param subSLength maximum size of the result in encoded bytes
     * @param charset    charset name, or {@code null} for GBK
     * @return the longest prefix of {@code str} that fits in {@code subSLength} bytes
     * @throws UnsupportedEncodingException if {@code charset} is not supported
     */
    public static String subStringInStrings(String str, int subSLength, String charset)
            throws UnsupportedEncodingException {
        if (charset == null) {
            charset = DEFAULT_CHARSET;
        }
        if (str == null) {
            return "";
        }
        // Start from the real char count so no retries are spent on lengths
        // that are clipped to the same substring anyway.
        int charCount = Math.min(str.length(), Math.max(subSLength, 0));
        String subStr = str.substring(0, charCount);
        int byteCount = subStr.getBytes(charset).length;
        while (charCount > 0 && byteCount > subSLength) {
            charCount--;
            subStr = str.substring(0, charCount);
            byteCount = subStr.getBytes(charset).length;
        }
        return subStr;
    }

    /**
     * Builds a random test string mixing chars in 0..126 (about two thirds)
     * with ideographs in 0x4e00..0x9fa5 (about one third).
     *
     * @param randomLength exclusive upper bound of the string length; must be positive
     * @return a string whose length is uniform in {@code [randomLength / 2, randomLength)}
     * @throws IllegalArgumentException if {@code randomLength} is not positive
     */
    public static String getRandomString(int randomLength) {
        if (randomLength <= 0) {
            throw new IllegalArgumentException("randomLength must be positive: " + randomLength);
        }
        int minLength = randomLength / 2;
        // Draw directly from the target range instead of retrying until a
        // draw happens to land in it; the distribution is the same.
        int strLength = minLength + RANDOM.nextInt(randomLength - minLength);
        char[] chars = new char[strLength];
        for (int i = 0; i < chars.length; i++) {
            if (RANDOM.nextInt(3) > 0) {
                chars[i] = (char) RANDOM.nextInt(Byte.MAX_VALUE);
            } else {
                chars[i] = (char) (CJK_FIRST + RANDOM.nextInt(CJK_LAST - CJK_FIRST + 1));
            }
        }
        return new String(chars);
    }

    /**
     * Runs the three truncation methods on 100 random strings, accumulates
     * the time spent in each, and prints the totals in milliseconds. Stops
     * early and dumps the inputs if the three results ever disagree.
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        long bytesNanos = 0;
        long stringsNanos = 0;
        long charsNanos = 0;
        int randomLength = 10000;
        for (int i = 0; i < 100; i++) {
            String str = getRandomString(randomLength);
            if (str.length() == 0) {
                continue; // nextInt(0) below would throw
            }
            int randomLeng = RANDOM.nextInt(str.length());

            // nanoTime: millisecond clocks are too coarse for runs this short.
            long start = System.nanoTime();
            String str1 = subStringInBytes(str, randomLeng, null);
            bytesNanos += System.nanoTime() - start;

            start = System.nanoTime();
            String str2 = subStringInStrings(str, randomLeng, null);
            stringsNanos += System.nanoTime() - start;

            start = System.nanoTime();
            String str3 = subStringInChars(str, randomLeng, null);
            charsNanos += System.nanoTime() - start;

            if (!str1.equals(str2) || !str2.equals(str3)) {
                System.out.println(i);
                System.out.println(str);
                System.out.println(randomLeng);
                System.out.println(str1);
                System.out.println(str2);
                System.out.println(str3);
                break;
            }
        }
        // The labels are the original Chinese text written as Unicode escapes
        // so the file compiles identically under any source encoding.
        System.out.println("\u5b57\u8282\u622a\u53d6:" + bytesNanos / NANOS_PER_MILLI);          // byte-based
        System.out.println("\u5b57\u7b26\u4e32\u622a\u53d6:" + stringsNanos / NANOS_PER_MILLI);  // string-based
        System.out.println("\u5b57\u7b26\u622a\u53d6:" + charsNanos / NANOS_PER_MILLI);          // char-based
    }
}
总共执行100次,每次的字符串长度至少为5000,不超过10000。
结果说明了一切(Intel Core E5700+4G):
字节截取:15
字符串截取:5110
字符截取:0
逐字符处理完胜,按字节处理性能也不赖.
浙公网安备 33010602011771号