java字符集(二)
了解了Java的字符集编码之后,那么Java是怎么把内存中的字节转换成你需要的字符的呢?其实很简单,
Java是通过StringCoding来完成字节与字符之间的转换的,它是JDK内部使用的一个类,现将源代码拷贝如下:

/**
 * Static helpers that convert between byte arrays and char arrays for a
 * named charset (or the default encoding).
 *
 * <p>For each direction the most recently used coder is cached per thread
 * behind a {@code SoftReference}. A coder is backed either by a
 * {@code java.nio.charset} {@code Charset} or, when no such charset is
 * found, by a legacy {@code ByteToCharConverter} /
 * {@code CharToByteConverter}.
 */
public class StringCoding
{
    /** Not instantiable: every member is static. */
    private StringCoding()
    {
    }

    /* The cached coders for each thread */
    private static ThreadLocal decoder = new ThreadLocal();
    private static ThreadLocal encoder = new ThreadLocal();

    /* True until the "default charset not supported" warning has been printed */
    private static boolean warnUnsupportedCharset = true;

    /**
     * Returns the object held by the soft reference stored in {@code tl}, or
     * {@code null} if nothing is stored for this thread or the referent has
     * been cleared.
     */
    private static Object deref(ThreadLocal tl)
    {
        SoftReference sr = (SoftReference) tl.get();
        if (sr == null)
            return null;
        return sr.get();
    }

    /** Stores {@code ob} in {@code tl}, wrapped in a new soft reference. */
    private static void set(ThreadLocal tl, Object ob)
    {
        tl.set(new SoftReference(ob));
    }

    // Trim the given byte array to the given length
    // (returns the same array when it already has that length)
    //
    private static byte[] trim(byte[] ba, int len)
    {
        if (len == ba.length)
            return ba;
        byte[] tba = new byte[len];
        System.arraycopy(ba, 0, tba, 0, len);
        return tba;
    }

    // Trim the given char array to the given length
    // (returns the same array when it already has that length)
    //
    private static char[] trim(char[] ca, int len)
    {
        if (len == ca.length)
            return ca;
        char[] tca = new char[len];
        System.arraycopy(ca, 0, tca, 0, len);
        return tca;
    }

    /**
     * Multiplies {@code len} by {@code expansionFactor} and truncates the
     * product to an int; used to size output buffers.
     */
    private static int scale(int len, float expansionFactor)
    {
        // We need to perform double, not float, arithmetic; otherwise
        // we lose low order bits when len is larger than 2**24.
        return (int) (len * (double) expansionFactor);
    }

    /**
     * Returns the charset named {@code csn}, or {@code null} when
     * {@code Charset.isSupported} reports it as unsupported.
     */
    private static Charset lookupCharset(String csn)
    {
        if (Charset.isSupported(csn))
        {
            try
            {
                return Charset.forName(csn);
            }
            catch (UnsupportedCharsetException x)
            {
                // isSupported() just returned true, so this is unexpected
                throw new Error(x);
            }
        }
        return null;
    }

    /**
     * Prints a one-time warning that the default charset {@code csn} is not
     * supported; later calls do nothing.
     */
    private static void warnUnsupportedCharset(String csn)
    {
        if (warnUnsupportedCharset)
        {
            // Use sun.misc.MessageUtils rather than the Logging API or
            // System.err since this method may be called during VM
            // initialization before either is available.
            MessageUtils.err("WARNING: Default charset " + csn
                + " not supported, using ISO-8859-1 instead");
            warnUnsupportedCharset = false;
        }
    }

    // -- Decoding --

    // Encapsulates either a ByteToCharConverter or a CharsetDecoder
    //
    private static abstract class StringDecoder
    {
        private final String requestedCharsetName;

        protected StringDecoder(String requestedCharsetName)
        {
            this.requestedCharsetName = requestedCharsetName;
        }

        /** The charset name that was requested when this decoder was created. */
        final String requestedCharsetName()
        {
            return requestedCharsetName;
        }

        /** The charset name reported by the underlying coder. */
        abstract String charsetName();

        /** Decodes {@code len} bytes of {@code ba} starting at {@code off}. */
        abstract char[] decode(byte[] ba, int off, int len);
    }

    // A string decoder based upon a ByteToCharConverter
    //
    private static class ConverterSD extends StringDecoder
    {
        private final ByteToCharConverter btc;

        private ConverterSD(ByteToCharConverter btc, String rcn)
        {
            super(rcn);
            this.btc = btc;
        }

        String charsetName()
        {
            return btc.getCharacterEncoding();
        }

        char[] decode(byte[] ba, int off, int len)
        {
            // Size the output for the worst case, then trim to what was written
            int en = scale(len, btc.getMaxCharsPerByte());
            char[] ca = new char[en];
            if (len == 0)
                return ca;
            btc.reset();
            int n = 0;
            try
            {
                n = btc.convert(ba, off, off + len, ca, 0, en);
                n += btc.flush(ca, btc.nextCharIndex(), en);
            }
            catch (CharConversionException x)
            {
                // Yes, this is what we've always done:
                // return only up to the index the converter reports
                n = btc.nextCharIndex();
            }
            return trim(ca, n);
        }
    }

    // A string decoder based upon a CharsetDecoder
    //
    private static class CharsetSD extends StringDecoder
    {
        private final Charset cs;
        private final CharsetDecoder cd;

        private CharsetSD(Charset cs, String rcn)
        {
            super(rcn);
            this.cs = cs;
            // Malformed and unmappable input is replaced, never reported
            this.cd = cs.newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
        }

        String charsetName()
        {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset) cs).historicalName();
            return cs.name();
        }

        char[] decode(byte[] ba, int off, int len)
        {
            // Size the output for the worst case, then trim to what was written
            int en = scale(len, cd.maxCharsPerByte());
            char[] ca = new char[en];
            if (len == 0)
                return ca;
            cd.reset();
            ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
            CharBuffer cb = CharBuffer.wrap(ca);
            try
            {
                CoderResult cr = cd.decode(bb, cb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = cd.flush(cb);
                if (!cr.isUnderflow())
                    cr.throwException();
            }
            catch (CharacterCodingException x)
            {
                // Substitution is always enabled,
                // so this shouldn't happen
                throw new Error(x);
            }
            return trim(ca, cb.position());
        }
    }

    /**
     * Decodes {@code len} bytes of {@code ba}, starting at {@code off}, using
     * the named charset.
     *
     * <p>The calling thread's cached decoder is reused when its requested or
     * reported charset name equals the requested name; otherwise a new
     * decoder is created and cached.
     *
     * @param charsetName charset name; {@code null} selects "ISO-8859-1"
     * @param ba          source bytes
     * @param off         index of the first byte to decode
     * @param len         number of bytes to decode
     * @return the decoded characters, trimmed to the number produced
     * @throws UnsupportedEncodingException declared for the
     *         {@code ByteToCharConverter.getConverter} fallback lookup
     */
    static char[] decode(String charsetName, byte[] ba, int off, int len)
        throws UnsupportedEncodingException
    {
        StringDecoder sd = (StringDecoder) deref(decoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        if ((sd == null)
            || !(csn.equals(sd.requestedCharsetName())
                 || csn.equals(sd.charsetName())))
        {
            sd = null;
            try
            {
                Charset cs = lookupCharset(csn);
                if (cs != null)
                    sd = new CharsetSD(cs, csn);
            }
            catch (IllegalCharsetNameException x)
            {
                // FALL THROUGH to ByteToCharConverter, for compatibility
            }
            if (sd == null)
                sd = new ConverterSD(ByteToCharConverter.getConverter(csn), csn);
            set(decoder, sd);
        }
        return sd.decode(ba, off, len);
    }

    /**
     * Decodes bytes using the default encoding name supplied by
     * {@code Converters}. If that encoding is unsupported, the default name
     * is reset, a one-time warning is printed, and ISO-8859-1 is used
     * instead. If ISO-8859-1 is unavailable too, an error is printed and the
     * VM exits.
     *
     * @param ba  source bytes
     * @param off index of the first byte to decode
     * @param len number of bytes to decode
     * @return the decoded characters
     */
    static char[] decode(byte[] ba, int off, int len)
    {
        String csn = Converters.getDefaultEncodingName();
        try
        {
            return decode(csn, ba, off, len);
        }
        catch (UnsupportedEncodingException x)
        {
            Converters.resetDefaultEncodingName();
            warnUnsupportedCharset(csn);
        }
        try
        {
            return decode("ISO-8859-1", ba, off, len);
        }
        catch (UnsupportedEncodingException x)
        {
            // If this code is hit during VM initialization, MessageUtils is
            // the only way we will be able to get any kind of error message.
            MessageUtils.err("ISO-8859-1 charset not available: "
                + x.toString());
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }

    // -- Encoding --

    // Encapsulates either a CharToByteConverter or a CharsetEncoder
    //
    private static abstract class StringEncoder
    {
        private final String requestedCharsetName;

        protected StringEncoder(String requestedCharsetName)
        {
            this.requestedCharsetName = requestedCharsetName;
        }

        /** The charset name that was requested when this encoder was created. */
        final String requestedCharsetName()
        {
            return requestedCharsetName;
        }

        /** The charset name reported by the underlying coder. */
        abstract String charsetName();

        /** Encodes {@code len} chars of {@code cs} starting at {@code off}. */
        abstract byte[] encode(char[] cs, int off, int len);
    }

    // A string encoder based upon a CharToByteConverter
    //
    private static class ConverterSE extends StringEncoder
    {
        private final CharToByteConverter ctb;

        private ConverterSE(CharToByteConverter ctb, String rcn)
        {
            super(rcn);
            this.ctb = ctb;
        }

        String charsetName()
        {
            return ctb.getCharacterEncoding();
        }

        byte[] encode(char[] ca, int off, int len)
        {
            // Size the output for the worst case, then trim to what was written
            int en = scale(len, ctb.getMaxBytesPerChar());
            byte[] ba = new byte[en];
            if (len == 0)
                return ba;
            ctb.reset();
            int n;
            try
            {
                n = ctb.convertAny(ca, off, (off + len), ba, 0, en);
                n += ctb.flushAny(ba, ctb.nextByteIndex(), en);
            }
            catch (CharConversionException x)
            {
                throw new Error("Converter malfunction: "
                    + ctb.getClass().getName(), x);
            }
            return trim(ba, n);
        }
    }

    // A string encoder based upon a CharsetEncoder
    //
    private static class CharsetSE extends StringEncoder
    {
        private final Charset cs;
        private final CharsetEncoder ce;

        private CharsetSE(Charset cs, String rcn)
        {
            super(rcn);
            this.cs = cs;
            // Malformed and unmappable input is replaced, never reported
            this.ce = cs.newEncoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
        }

        String charsetName()
        {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset) cs).historicalName();
            return cs.name();
        }

        byte[] encode(char[] ca, int off, int len)
        {
            // Size the output for the worst case, then trim to what was written
            int en = scale(len, ce.maxBytesPerChar());
            byte[] ba = new byte[en];
            if (len == 0)
                return ba;
            ce.reset();
            ByteBuffer bb = ByteBuffer.wrap(ba);
            CharBuffer cb = CharBuffer.wrap(ca, off, len);
            try
            {
                CoderResult cr = ce.encode(cb, bb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = ce.flush(bb);
                if (!cr.isUnderflow())
                    cr.throwException();
            }
            catch (CharacterCodingException x)
            {
                // Substitution is always enabled,
                // so this shouldn't happen
                throw new Error(x);
            }
            return trim(ba, bb.position());
        }
    }

    /**
     * Encodes {@code len} chars of {@code ca}, starting at {@code off}, using
     * the named charset.
     *
     * <p>The calling thread's cached encoder is reused when its requested or
     * reported charset name equals the requested name; otherwise a new
     * encoder is created and cached.
     *
     * @param charsetName charset name; {@code null} selects "ISO-8859-1"
     * @param ca          source characters
     * @param off         index of the first char to encode
     * @param len         number of chars to encode
     * @return the encoded bytes, trimmed to the number produced
     * @throws UnsupportedEncodingException declared for the
     *         {@code CharToByteConverter.getConverter} fallback lookup
     */
    static byte[] encode(String charsetName, char[] ca, int off, int len)
        throws UnsupportedEncodingException
    {
        StringEncoder se = (StringEncoder) deref(encoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        if ((se == null)
            || !(csn.equals(se.requestedCharsetName())
                 || csn.equals(se.charsetName())))
        {
            se = null;
            try
            {
                Charset cs = lookupCharset(csn);
                if (cs != null)
                    se = new CharsetSE(cs, csn);
            }
            catch (IllegalCharsetNameException x)
            {
                // FALL THROUGH to CharToByteConverter, for compatibility
            }
            if (se == null)
                se = new ConverterSE(CharToByteConverter.getConverter(csn), csn);
            set(encoder, se);
        }
        return se.encode(ca, off, len);
    }

    /**
     * Encodes chars using the default encoding name supplied by
     * {@code Converters}. If that encoding is unsupported, the default name
     * is reset, a one-time warning is printed, and ISO-8859-1 is used
     * instead. If ISO-8859-1 is unavailable too, an error is printed and the
     * VM exits.
     *
     * <p>This method writes nothing to standard output; a leftover debug
     * print of the default charset name has been removed.
     *
     * @param ca  source characters
     * @param off index of the first char to encode
     * @param len number of chars to encode
     * @return the encoded bytes
     */
    static byte[] encode(char[] ca, int off, int len)
    {
        String csn = Converters.getDefaultEncodingName();
        try
        {
            return encode(csn, ca, off, len);
        }
        catch (UnsupportedEncodingException x)
        {
            Converters.resetDefaultEncodingName();
            warnUnsupportedCharset(csn);
        }
        try
        {
            return encode("ISO-8859-1", ca, off, len);
        }
        catch (UnsupportedEncodingException x)
        {
            // If this code is hit during VM initialization, MessageUtils is
            // the only way we will be able to get any kind of error message.
            MessageUtils.err("ISO-8859-1 charset not available: "
                + x.toString());
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }
}
好,下面测试一下:

/**
 * Demo driver: encodes the two characters "你好" with the default charset,
 * GBK and UTF-8, prints every resulting byte as eight binary digits, then
 * decodes a fixed 4-byte sequence as GBK and as UTF-8 to show how the same
 * bytes yield different text.
 */
public class testCode
{
    /**
     * Formats one byte as exactly eight binary digits.
     *
     * <p>{@code Integer.toBinaryString(b).substring(24, 32)} only works for
     * negative bytes, which sign-extend to 32 digits. For a byte in 0..127
     * the string is shorter than 24 characters and {@code substring} throws
     * {@code StringIndexOutOfBoundsException}.
     */
    private static String toBits(byte value)
    {
        // Mask to the unsigned value and set bit 8 so the result always has
        // nine digits; dropping the leading '1' leaves eight zero-padded bits.
        return Integer.toBinaryString((value & 0xFF) | 0x100).substring(1);
    }

    /** Prints each byte of {@code bytes} on its own line as eight bits. */
    private static void printBits(byte[] bytes)
    {
        for (int i = 0; i < bytes.length; i++)
        {
            System.out.println(toBits(bytes[i]));
        }
    }

    /**
     * Runs the encode/decode demonstration and prints the results.
     *
     * @param args unused
     * @throws UnsupportedEncodingException propagated from the
     *         {@code StringCoding} calls that take a charset name
     */
    public static void main(String args[]) throws UnsupportedEncodingException
    {
        char[] a =
        { '你', '好' };

        System.out.println("default:");
        printBits(StringCoding.encode(a, 0, a.length));

        System.out.println("GBK:");
        printBits(StringCoding.encode("GBK", a, 0, a.length));

        System.out.println("UTF-8:");
        printBits(StringCoding.encode("utf-8", a, 0, a.length));

        // A fixed 4-byte sequence, written in binary, decoded two ways below
        byte[] code = new byte[4];
        code[0] = (byte) Integer.parseInt("11000100", 2);
        code[1] = (byte) Integer.parseInt("11100011", 2);
        code[2] = (byte) Integer.parseInt("10111010", 2);
        code[3] = (byte) Integer.parseInt("11000011", 2);

        // println(char[]) prints the characters themselves
        System.out.println(StringCoding.decode("GBK", code, 0, code.length));
        System.out.println(StringCoding.decode("utf-8", code, 0, code.length));
    }
}
运行结果如下:
default:
11000100
11100011
10111010
11000011
GBK:
11000100
11100011
10111010
11000011
UTF-8:
11100100
10111101
10100000
11100101
10100101
10111101
你好
???
看出不同了吧,Windows操作系统(简体中文版)默认使用GBK编码方式,特别是代码最后几行,我们把同样的字节输入,按不同的编码解码输出,就有不同的结果。想必您已经知道您的程序为什么出现乱码了吧。

浙公网安备 33010602011771号