hadoop 中文乱码问题解决


其实 Hadoop 在处理数据时操作的一直是字节数组。问题出在我们手写的部分:Text.toString() 方法默认按 UTF-8 对字节进行解码,所以 GBK 编码的内容会变成乱码。

// Output format whose record writer encodes the line terminator and the
// key/value separator as GBK bytes.
// NOTE(review): this listing is truncated - closing braces, most method bodies
// and the Javadoc delimiters are missing - so it does not compile as shown.
// Presumably it was adapted from Hadoop's TextOutputFormat; confirm against the
// complete source before relying on any detail that is not visible here.
public class GbkOutputFormat<K, V> extends FileOutputFormat<K, V> {	

	  // Record writer that holds the output stream and the GBK-encoded separator.
	  protected static class LineRecordWriter<K, V>
	    extends RecordWriter<K, V> {
	    // Charset name used by every String-to-byte conversion visible in this class.
	    private static final String gbk = "gbk";
	    // Line terminator bytes, encoded once in the static initializer below.
	    private static final byte[] newline;
	    static {
	      try {
	        newline = "\n".getBytes(gbk);
	      } catch (UnsupportedEncodingException uee) {
	        // The JVM does not support the named charset; rethrown as unchecked.
	        throw new IllegalArgumentException("can't find " + gbk + " encoding");

	    // Destination stream, assigned in the constructor.
	    protected DataOutputStream out;
	    // Key/value separator, stored as GBK bytes.
	    private final byte[] keyValueSeparator;

	    // Stores the stream and converts the separator string to GBK bytes.
	    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
	      this.out = out;
	      try {
	        this.keyValueSeparator = keyValueSeparator.getBytes(gbk);
	      } catch (UnsupportedEncodingException uee) {
	        throw new IllegalArgumentException("can't find " + gbk + " encoding");

	    // Convenience constructor: delegates with a tab as the separator.
	    public LineRecordWriter(DataOutputStream out) {
	      this(out, "\t");

	    // NOTE(review): the next four lines are the remains of a Javadoc block
	    // whose opening and closing markers were lost in this listing.
	     * Write the object to the byte stream, handling Text as a special
	     * case.
	     * @param o the object to print
	     * @throws IOException if the write throws, we pass it on
	    // Body truncated here; only the Text type check is visible.
	    private void writeObject(Object o) throws IOException {
	      if (o instanceof Text) {

	    // Handles one key/value pair; the bodies of the branches below are not
	    // visible in this listing.
	    public synchronized void write(K key, V value)
	      throws IOException {

	      // A null reference and a NullWritable instance are both treated as "absent".
	      boolean nullKey = key == null || key instanceof NullWritable;
	      boolean nullValue = value == null || value instanceof NullWritable;
	      if (nullKey && nullValue) {
	      if (!nullKey) {
	      if (!(nullKey || nullValue)) {
	      if (!nullValue) {

	    // Body not visible in this listing.
	    public synchronized 
	    void close(TaskAttemptContext context) throws IOException {

	  // Builds a LineRecordWriter for the task's work file, reading the
	  // separator and the compression setting from the job.
	  public RecordWriter<K, V> 
	         getRecordWriter(TaskAttemptContext job
	                         ) throws IOException, InterruptedException {
	    Configuration conf = job.getConfiguration();
	    boolean isCompressed = getCompressOutput(job);
	    // The default-value argument of this call is cut off in the listing.
	    String keyValueSeparator= conf.get("mapred.textoutputformat.separator",
	    CompressionCodec codec = null;
	    String extension = "";
	    if (isCompressed) {
	      // GzipCodec.class is the second argument - presumably the fallback when
	      // the job names no codec; confirm against getOutputCompressorClass.
	      Class<? extends CompressionCodec> codecClass = 
	        getOutputCompressorClass(job, GzipCodec.class);
	      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
	      extension = codec.getDefaultExtension();
	    // The file name carries the codec's extension, or none when uncompressed.
	    Path file = getDefaultWorkFile(job, extension);
	    FileSystem fs = file.getFileSystem(conf);
	    if (!isCompressed) {
	      // Second argument false: presumably "do not overwrite" - confirm
	      // against FileSystem.create.
	      FSDataOutputStream fileOut = fs.create(file, false);
	      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
	    } else {
	      FSDataOutputStream fileOut = fs.create(file, false);
	      // The wrapped-stream expression is cut off in this listing.
	      return new LineRecordWriter<K, V>(new DataOutputStream


手写的mapper 方法中要注意转换成GBK格式,以确保从头到尾都是GBK操作。

String str=new String(value.getBytes[],"GBK") ;




posted @ 2013-03-09 22:22  肉馅饺子  阅读(2088)  评论(2)  编辑  收藏  举报