1 /**
2 * 判断文件的编码格式
3 * @param fileName :file
4 * @return
5 * @return 文件编码格式
6 * @throws Exception
7 */
8 public String codeString(String fileName) throws Exception{
9 String charset = "GBK";
10 byte[] first3Bytes = new byte[3];
11 try {
12 boolean checked = false;
13 BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fileName));
14 bis.mark(0);
15 int read = bis.read(first3Bytes, 0, 3);
16 if (read == -1) {
17 return charset; // 文件编码为 ANSI
18 } else if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
19 charset = "UTF-16LE"; // 文件编码为 Unicode
20 checked = true;
21 } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
22 charset = "UTF-16BE"; // 文件编码为 Unicode big endian
23 checked = true;
24 } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB
25 && first3Bytes[2] == (byte) 0xBF) {
26 charset = "UTF-8"; // 文件编码为 UTF-8
27 checked = true;
28 }
29 bis.reset();
30 if (!checked) {
31 int loc = 0;
32 while ((read = bis.read()) != -1) {
33 loc++;
34 if (read >= 0xF0) {
35 break;
36 }
37 if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK
38 {
39 break;
40 }
41 if (0xC0 <= read && read <= 0xDF) {
42 read = bis.read();
43 if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)
44 // (0x80
45 // - 0xBF),也可能在GB编码内
46 {
47 continue;
48 } else {
49 break;
50 }
51 } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小
52 read = bis.read();
53 if (0x80 <= read && read <= 0xBF) {
54 read = bis.read();
55 if (0x80 <= read && read <= 0xBF) {
56 charset = "UTF-8";
57 break;
58 } else {
59 break;
60 }
61 } else {
62 break;
63 }
64 }
65 }
66 }
67 bis.close();
68 } catch (Exception e) {
69 e.printStackTrace();
70 }
71 return charset;
72 }