代码改变世界

IO 流读取文件时候出现乱码 文件编码格式问题 怎么转换解决方法

2014-04-14 23:36  fuimaz  阅读(474)  评论(0)    收藏  举报

使用下面这种写法读取 UTF-8 编码的文件时，会出现乱码问题（其中 File myFile = new File("文件路径");）。

Java代码 复制代码 收藏代码
  1. BufferedReader in = new BufferedReader(new FileReader(myFile));  
 // FileReader(File) takes no charset argument, so the bytes are decoded
 // with the platform default charset rather than the file's own encoding.
 FileReader fileReader = new FileReader(myFile);
 BufferedReader in = new BufferedReader(fileReader);

应该修改为:

Java代码 复制代码 收藏代码
  1. BufferedReader in = new BufferedReader(new InputStreamReader(   
  2.                 new FileInputStream(myFile), "UTF-8"));  
 // Read the file as raw bytes, decode them explicitly as UTF-8, then buffer
 // the resulting character stream.
 FileInputStream rawBytes = new FileInputStream(myFile);
 InputStreamReader utf8Reader = new InputStreamReader(rawBytes, "UTF-8");
 BufferedReader in = new BufferedReader(utf8Reader);

如果文件使用ANSI编码，请使用下面的文件读取方式:

Java代码 复制代码 收藏代码
  1. InputStreamReader reader = new InputStreamReader(   
  2.                         new FileInputStream(new File("文件路径")), "gb2312");  
// Open the file as raw bytes and decode them explicitly as GB2312.
File source = new File("文件路径");
FileInputStream rawBytes = new FileInputStream(source);
InputStreamReader reader = new InputStreamReader(rawBytes, "gb2312");

下面是我对文件编码的判断方法:

Java代码 复制代码 收藏代码
  1. /**  
  2.      * 上传文件编码判断  
  3.      * */  
  4.     public static String get_charset(File file) {   
  5.         String charset = "GBK";   
  6.         byte[] first3Bytes = new byte[3];   
  7.         try {   
  8.             boolean checked = false;   
  9.             ;   
  10.             BufferedInputStream bis = new BufferedInputStream(   
  11.                     new FileInputStream(file));   
  12.             bis.mark(0);   
  13.             int read = bis.read(first3Bytes, 0, 3);   
  14.             if (read == -1)   
  15.                 return charset;   
  16.             if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {   
  17.                 charset = "UTF-16LE";   
  18.                 checked = true;   
  19.             } else if (first3Bytes[0] == (byte) 0xFE  
  20.                     && first3Bytes[1] == (byte) 0xFF) {   
  21.                 charset = "UTF-16BE";   
  22.                 checked = true;   
  23.             } else if (first3Bytes[0] == (byte) 0xEF  
  24.                     && first3Bytes[1] == (byte) 0xBB  
  25.                     && first3Bytes[2] == (byte) 0xBF) {   
  26.                 charset = "UTF-8";   
  27.                 checked = true;   
  28.             }   
  29.             bis.reset();   
  30.             if (!checked) {   
  31.                 // int len = 0;   
  32.                 int loc = 0;   
  33.   
  34.                 while ((read = bis.read()) != -1) {   
  35.                     loc++;   
  36.                     if (read >= 0xF0)   
  37.                         break;   
  38.                     if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK   
  39.                         break;   
  40.                     if (0xC0 <= read && read <= 0xDF) {   
  41.                         read = bis.read();   
  42.                         if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)   
  43.                             // (0x80   
  44.                             // - 0xBF),也可能在GB编码内   
  45.                             continue;   
  46.                         else  
  47.                             break;   
  48.                     } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小   
  49.                         read = bis.read();   
  50.                         if (0x80 <= read && read <= 0xBF) {   
  51.                             read = bis.read();   
  52.                             if (0x80 <= read && read <= 0xBF) {   
  53.                                 charset = "UTF-8";   
  54.                                 break;   
  55.                             } else  
  56.                                 break;   
  57.                         } else  
  58.                             break;   
  59.                     }   
  60.                 }   
  61.   
  62.             }   
  63.   
  64.             bis.close();   
  65.         } catch (Exception e) {   
  66.             e.printStackTrace();   
  67.         }   
  68.   
  69.         return charset;   
  70.     }  
/**
	 * Guesses the character encoding of an uploaded file.
	 *
	 * <p>The first bytes are checked for a byte-order mark: {@code FF FE} gives
	 * "UTF-16LE", {@code FE FF} gives "UTF-16BE" and {@code EF BB BF} gives
	 * "UTF-8". When no BOM is present the content is scanned from the start and
	 * reported as "UTF-8" only if the first multi-byte sequence that decides the
	 * question is a well-formed three-byte sequence (lead byte 0xE0-0xEF followed
	 * by two bytes in 0x80-0xBF). In every other case, including empty files,
	 * pure-ASCII files and any I/O failure, the default "GBK" is returned.
	 *
	 * <p>This is a heuristic: two-byte sequences are skipped because they are
	 * also valid GBK, and a GBK file can occasionally look like UTF-8.
	 *
	 * @param file the file to inspect
	 * @return one of "GBK", "UTF-8", "UTF-16LE" or "UTF-16BE"
	 */
	public static String get_charset(File file) {
		String charset = "GBK";
		byte[] first3Bytes = new byte[3];
		// try-with-resources closes the stream on every path, including the
		// early return for an empty file and any exception thrown while reading.
		try (BufferedInputStream bis = new BufferedInputStream(
				new FileInputStream(file))) {
			// The read limit must cover the bytes consumed before reset().
			bis.mark(first3Bytes.length);
			int read = bis.read(first3Bytes, 0, 3);
			if (read == -1) {
				return charset;
			}

			boolean checked = false;
			if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
				charset = "UTF-16LE";
				checked = true;
			} else if (first3Bytes[0] == (byte) 0xFE
					&& first3Bytes[1] == (byte) 0xFF) {
				charset = "UTF-16BE";
				checked = true;
			} else if (first3Bytes[0] == (byte) 0xEF
					&& first3Bytes[1] == (byte) 0xBB
					&& first3Bytes[2] == (byte) 0xBF) {
				charset = "UTF-8";
				checked = true;
			}

			if (!checked) {
				// No BOM: rewind and scan the content byte by byte.
				bis.reset();
				while ((read = bis.read()) != -1) {
					if (read >= 0xF0) {
						break;
					}
					// A lone continuation byte (0x80-0xBF) is treated as GBK.
					if (0x80 <= read && read <= 0xBF) {
						break;
					}
					if (0xC0 <= read && read <= 0xDF) {
						read = bis.read();
						// Two-byte form (0xC0-0xDF)(0x80-0xBF) may also occur
						// in GB encodings, so it decides nothing: keep scanning.
						if (0x80 <= read && read <= 0xBF) {
							continue;
						}
						break;
					} else if (0xE0 <= read && read <= 0xEF) {
						// Three-byte form; a false positive is possible but unlikely.
						read = bis.read();
						if (0x80 <= read && read <= 0xBF) {
							read = bis.read();
							if (0x80 <= read && read <= 0xBF) {
								charset = "UTF-8";
							}
						}
						break;
					}
				}
			}
		} catch (Exception e) {
			// Best effort: report the failure and fall back to the charset
			// determined so far (the "GBK" default unless a BOM was seen).
			e.printStackTrace();
		}

		return charset;
	}

调用时根据判断出的编码方式（UTF-8 或 ANSI 编码）选择读取方式:

Java代码 复制代码 收藏代码
  1. BufferedReader br = null;   
  2.             if (charset == "GBK") {   
  3.                 InputStreamReader reader = new InputStreamReader(   
  4.                         new FileInputStream(new File(filepath)), "gb2312");   
  5.                 br = new BufferedReader(reader);   
  6.             }   
  7.             if (charset == "UTF-8") {   
  8.                 br = new BufferedReader(new InputStreamReader(   
  9.                         new FileInputStream(filepath), "UTF-8"));   
  10.             }  
// Open the file with the charset name that was detected for it. The name is
// passed straight to InputStreamReader instead of being compared with ==
// (reference identity) against string literals, so every value get_charset
// can return ("GBK", "UTF-8", "UTF-16LE", "UTF-16BE") yields a reader, and a
// file detected as GBK is decoded as GBK rather than the narrower GB2312.
BufferedReader br = null;
if (charset != null) {
	br = new BufferedReader(new InputStreamReader(
			new FileInputStream(filepath), charset));
}