提取灵格斯里ld2格式词典的内容
作为一个NLPer,拥有的文本相关资源当然越多越好,词典资源就是其中之一。灵格斯里有那么多词典,如何提取其中的内容是个问题。之前在网上搜了一些相关信息,最终找到一份Java代码,通过它可以直接把灵格斯.ld2格式词典的内容提取出来。具体做法应该是根据.ld2的文件格式进行解析,从而得到其中的内容。
感谢“Copyright (c) 2010 Xiaoyun Zhu”和“@author keke”,提供了这么方便的接口!
使用时,只需修改 final String ld2File = "D:\\kr.ld2"; 这一行中的路径,即可处理自己想要的词典。
/* Copyright (c) 2010 Xiaoyun Zhu
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

/**
 * Lingoes LD2/LDF File Reader
 *
 * <pre>
 * Lingoes Format overview:
 *
 * General Information:
 * - Dictionary data are stored in deflate streams.
 * - Index group information is stored in an index array in the LD2 file itself.
 * - Numbers are using little endian byte order.
 * - Definitions and xml data have UTF-8 or UTF-16LE encodings.
 *
 * LD2 file schema:
 * - File Header
 * - File Description
 * - Additional Information (optional)
 * - Index Group (corresponds to definitions in dictionary)
 * - Deflated Dictionary Streams
 * -- Index Data
 * --- Offsets of definitions
 * --- Offsets of translations
 * --- Flags
 * --- References to other translations
 * -- Definitions
 * -- Translations (xml)
 *
 * TODO: find encoding / language fields to replace auto-detect of encodings
 *
 * </pre>
 *
 * @author keke
 */
public class LingoesLd2Reader {
    /** Dictionary that is processed when no path is passed on the command line. */
    private static final String DEFAULT_LD2_FILE = "D:\\kr.ld2";

    /**
     * Candidate decoders tried by {@link #detectEncodings}, in priority order. Index 1 (UTF-16LE) is the
     * fallback; the first three charsets are guaranteed by the Java platform, so that index is stable even
     * when the optional EUC-JP charset is missing.
     */
    private static final SensitiveStringDecoder[] AVAIL_ENCODINGS =
            availableDecoders("UTF-8", "UTF-16LE", "UTF-16BE", "EUC-JP");

    /**
     * Extracts an LD2 dictionary into {@code .inflated}, {@code .idx}, {@code .words}, {@code .xml} and
     * {@code .output} files written next to the input file.
     *
     * @param args optional; {@code args[0]} is the path of the LD2 file, otherwise the built-in default is used
     * @throws IOException if the dictionary cannot be read or an output file cannot be written
     */
    public static void main(final String[] args) throws IOException {
        // download from
        // https://skydrive.live.com/?cid=a10100d37adc7ad3&sc=documents&id=A10100D37ADC7AD3%211172#cid=A10100D37ADC7AD3&sc=documents
        final String ld2File = ((args != null) && (args.length > 0)) ? args[0] : DEFAULT_LD2_FILE;

        // read lingoes ld2 into byte array
        final ByteBuffer dataRawBytes = LingoesLd2Reader.readFully(ld2File);

        System.out.println("文件:" + ld2File);
        if (dataRawBytes.limit() < 0x60) {
            // Too short to even hold the fixed header fields read below.
            System.err.println("文件不包含字典数据。网上字典?");
            return;
        }
        System.out.println("类型:" + new String(dataRawBytes.array(), 0, 4, StandardCharsets.US_ASCII));
        System.out.println("版本:" + dataRawBytes.getShort(0x18) + "." + dataRawBytes.getShort(0x1A));
        System.out.println("ID: 0x" + Long.toHexString(dataRawBytes.getLong(0x1C)));

        final int offsetData = dataRawBytes.getInt(0x5C) + 0x60;
        if (dataRawBytes.limit() > offsetData) {
            System.out.println("简介地址:0x" + Integer.toHexString(offsetData));
            final int type = dataRawBytes.getInt(offsetData);
            System.out.println("简介类型:0x" + Integer.toHexString(type));
            final int offsetWithInfo = dataRawBytes.getInt(offsetData + 4) + offsetData + 12;
            if (type == 3) {
                // without additional information
                LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetData);
            } else if (dataRawBytes.limit() > (offsetWithInfo - 0x1C)) {
                LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetWithInfo);
            } else {
                System.err.println("文件不包含字典数据。网上字典?");
            }
        } else {
            System.err.println("文件不包含字典数据。网上字典?");
        }
    }

    /** Builds one strict decoder per charset name, skipping charsets this runtime does not provide. */
    private static SensitiveStringDecoder[] availableDecoders(final String... charsetNames) {
        final List<SensitiveStringDecoder> decoders = new ArrayList<>();
        for (final String charsetName : charsetNames) {
            if (Charset.isSupported(charsetName)) {
                decoders.add(new SensitiveStringDecoder(Charset.forName(charsetName)));
            }
        }
        return decoders.toArray(new SensitiveStringDecoder[0]);
    }

    /**
     * Loads a whole file into a heap buffer configured for little-endian access and positioned at 0.
     *
     * @param path file to read
     * @return buffer holding the complete file content
     * @throws IOException if the file is larger than a single buffer can hold, ends early, or cannot be read
     */
    private static ByteBuffer readFully(final String path) throws IOException {
        try (RandomAccessFile file = new RandomAccessFile(path, "r"); FileChannel channel = file.getChannel()) {
            final long size = channel.size();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File too large to load into memory: " + path);
            }
            final ByteBuffer buffer = ByteBuffer.allocate((int) size);
            // A single read() call may return before the buffer is full, so keep reading.
            while (buffer.hasRemaining()) {
                if (channel.read(buffer) < 0) {
                    throw new IOException("Unexpected end of file: " + path);
                }
            }
            buffer.order(ByteOrder.LITTLE_ENDIAN);
            buffer.rewind();
            return buffer;
        }
    }

    /**
     * Inflates one deflate stream taken from {@code data} and writes the result to {@code inflatedFile}.
     *
     * @param inflatedFile destination file
     * @param data buffer whose backing array contains the compressed stream
     * @param offset start of the compressed stream in the backing array
     * @param length length of the compressed stream in bytes
     * @param append {@code true} to append to the destination, {@code false} to overwrite it
     * @return number of compressed bytes consumed by the inflater
     * @throws IOException if the stream is corrupt or the destination cannot be written
     */
    private static long decompress(final String inflatedFile, final ByteBuffer data, final int offset,
            final int length, final boolean append) throws IOException {
        final Inflater inflater = new Inflater();
        try (InflaterInputStream in = new InflaterInputStream(
                new ByteArrayInputStream(data.array(), offset, length), inflater, 1024 * 8);
                FileOutputStream out = new FileOutputStream(inflatedFile, append)) {
            LingoesLd2Reader.writeInputStream(in, out);
            return inflater.getBytesRead();
        } finally {
            // A caller-supplied Inflater is not released by InflaterInputStream.close().
            inflater.end();
        }
    }

    /**
     * Tries every (word, xml) decoder pair on the first definitions (at most 10) and returns the first pair
     * that decodes all of them without error.
     *
     * @return two-element array: word decoder, xml decoder; UTF-16LE for both when no pair succeeds
     */
    private static SensitiveStringDecoder[] detectEncodings(final ByteBuffer inflatedBytes, final int offsetWords,
            final int offsetXml, final int defTotal, final int dataLen, final int[] idxData,
            final String[] defData) {
        final int test = Math.min(defTotal, 10);
        for (final SensitiveStringDecoder wordDecoder : LingoesLd2Reader.AVAIL_ENCODINGS) {
            for (final SensitiveStringDecoder xmlDecoder : LingoesLd2Reader.AVAIL_ENCODINGS) {
                try {
                    for (int i = 0; i < test; i++) {
                        LingoesLd2Reader.readDefinitionData(inflatedBytes, offsetWords, offsetXml, dataLen,
                                wordDecoder, xmlDecoder, idxData, defData, i);
                    }
                } catch (final CharacterCodingException | RuntimeException ignored) {
                    // This pair cannot decode the sample (or produced out-of-range offsets); try the next one.
                    continue;
                }
                System.out.println("词组编码:" + wordDecoder.name);
                System.out.println("XML编码:" + xmlDecoder.name);
                return new SensitiveStringDecoder[] { wordDecoder, xmlDecoder };
            }
        }
        System.err.println("自动识别编码失败!选择UTF-16LE继续。");
        return new SensitiveStringDecoder[] { LingoesLd2Reader.AVAIL_ENCODINGS[1],
                LingoesLd2Reader.AVAIL_ENCODINGS[1] };
    }

    /**
     * Decodes every definition of the inflated data and writes the words, the stripped translations, the
     * combined {@code word=translation} lines and the index listing, all as UTF-8 text.
     *
     * @param inflatedFile file produced by {@link #inflate}
     * @param indexFile receives one {@code word, index} line per entry of {@code idxArray}
     * @param extractedWordsFile receives one word per line
     * @param extractedXmlFile receives one stripped translation per line
     * @param extractedOutputFile receives one {@code word=translation} line per definition
     * @param idxArray index group read from the LD2 file; each value is a definition number
     * @param offsetDefs offset of the words section in the inflated data (equals the length of the index section)
     * @param offsetXml offset of the xml section in the inflated data
     * @throws IOException if reading, decoding or writing fails
     */
    private static void extract(final String inflatedFile, final String indexFile, final String extractedWordsFile,
            final String extractedXmlFile, final String extractedOutputFile, final int[] idxArray,
            final int offsetDefs, final int offsetXml)
            throws IOException, FileNotFoundException, UnsupportedEncodingException {
        System.out.println("写入'" + extractedOutputFile + "'。。。");

        // read inflated data
        final ByteBuffer dataRawBytes = LingoesLd2Reader.readFully(inflatedFile);

        final int dataLen = 10;
        // Each lookup also reads the start of the following record, so the last record has no definition.
        final int defTotal = Math.max(0, (offsetDefs / dataLen) - 1);

        final String[] words = new String[defTotal];
        final int[] idxData = new int[6];
        final String[] defData = new String[2];

        final SensitiveStringDecoder[] encodings = LingoesLd2Reader.detectEncodings(dataRawBytes, offsetDefs,
                offsetXml, defTotal, dataLen, idxData, defData);

        int counter = 0;
        try (Writer indexWriter = LingoesLd2Reader.openUtf8Writer(indexFile);
                Writer defsWriter = LingoesLd2Reader.openUtf8Writer(extractedWordsFile);
                Writer xmlWriter = LingoesLd2Reader.openUtf8Writer(extractedXmlFile);
                Writer outputWriter = LingoesLd2Reader.openUtf8Writer(extractedOutputFile)) {
            for (int i = 0; i < defTotal; i++) {
                LingoesLd2Reader.readDefinitionData(dataRawBytes, offsetDefs, offsetXml, dataLen, encodings[0],
                        encodings[1], idxData, defData, i);

                words[i] = defData[0];
                defsWriter.write(defData[0]);
                defsWriter.write("\n");

                xmlWriter.write(defData[1]);
                xmlWriter.write("\n");

                outputWriter.write(defData[0]);
                outputWriter.write("=");
                outputWriter.write(defData[1]);
                outputWriter.write("\n");

                System.out.println(defData[0] + " = " + defData[1]);
                counter++;
            }

            for (final int idx : idxArray) {
                if ((idx < 0) || (idx >= words.length)) {
                    // The index group points at a definition that was not decoded; skip it instead of crashing.
                    System.err.println("索引超出范围,已跳过:" + idx);
                    continue;
                }
                indexWriter.write(words[idx]);
                indexWriter.write(", ");
                indexWriter.write(String.valueOf(idx));
                indexWriter.write("\n");
            }
        }
        System.out.println("成功读出" + counter + "组数据。");
    }

    /** Opens a buffered writer that always encodes as UTF-8, independent of the platform default charset. */
    private static Writer openUtf8Writer(final String path) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8));
    }

    /**
     * Reads 18 bytes starting at {@code position}: two ints, two unsigned bytes and two more ints, stored
     * in that order into {@code wordIdxData[0..5]}. The buffer position is left after the last int.
     *
     * @param dataRawBytes buffer to read from (its byte order is used as configured by the caller)
     * @param position absolute offset of the first int
     * @param wordIdxData receives the six values; must have at least six elements
     */
    private static void getIdxData(final ByteBuffer dataRawBytes, final int position, final int[] wordIdxData) {
        dataRawBytes.position(position);
        wordIdxData[0] = dataRawBytes.getInt();
        wordIdxData[1] = dataRawBytes.getInt();
        wordIdxData[2] = dataRawBytes.get() & 0xff;
        wordIdxData[3] = dataRawBytes.get() & 0xff;
        wordIdxData[4] = dataRawBytes.getInt();
        wordIdxData[5] = dataRawBytes.getInt();
    }

    /**
     * Inflates all deflate streams, which are stored back to back starting at the buffer's current position,
     * into {@code inflatedFile}. A failure is reported on stderr and stops the inflation; streams already
     * written stay in the file.
     *
     * @param dataRawBytes LD2 file content, positioned at the first compressed stream
     * @param deflateStreams end offset of each stream, relative to the start position
     * @param inflatedFile destination file
     */
    private static void inflate(final ByteBuffer dataRawBytes, final List<Integer> deflateStreams,
            final String inflatedFile) {
        System.out.println("解压缩'" + deflateStreams.size() + "'个数据流至'" + inflatedFile + "'。。。");
        final int startOffset = dataRawBytes.position();
        int offset = -1;
        int lastOffset = startOffset;
        boolean append = false;
        try {
            for (final Integer offsetRelative : deflateStreams) {
                offset = startOffset + offsetRelative.intValue();
                LingoesLd2Reader.decompress(inflatedFile, dataRawBytes, lastOffset, offset - lastOffset, append);
                append = true;
                lastOffset = offset;
            }
        } catch (final IOException | RuntimeException e) {
            System.err.println("解压缩失败: 0x" + Integer.toHexString(offset) + ": " + e.toString());
        }
    }

    /**
     * Decodes definition number {@code i}: the word goes to {@code defData[0]} and the stripped translation
     * to {@code defData[1]}. When the index record carries references, the translations of the referenced
     * definitions are placed in front of the entry's own translation, separated by {@code ", "}.
     *
     * @param inflatedBytes inflated dictionary data (index, words, xml)
     * @param offsetWords offset of the words section
     * @param offsetXml offset of the xml section
     * @param dataLen distance in bytes between two index records
     * @param wordStringDecoder decoder for the word bytes
     * @param xmlStringDecoder decoder for the xml bytes
     * @param idxData scratch array with at least six elements
     * @param defData receives word and translation; must have at least two elements
     * @param i definition number
     * @throws CharacterCodingException if the bytes are not valid in the given encodings
     */
    private static void readDefinitionData(final ByteBuffer inflatedBytes, final int offsetWords,
            final int offsetXml, final int dataLen, final SensitiveStringDecoder wordStringDecoder,
            final SensitiveStringDecoder xmlStringDecoder, final int[] idxData, final String[] defData,
            final int i) throws CharacterCodingException {
        LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * i, idxData);
        int lastWordPos = idxData[0];
        int lastXmlPos = idxData[1];
        // idxData[2] holds the flags, which are not used here.
        int refs = idxData[3];
        // With 10-byte records the last two values are the start offsets stored in the next record,
        // i.e. the end offsets of this entry.
        final int currentWordOffset = idxData[4];
        int currentXmlOffset = idxData[5];
        final byte[] bytes = inflatedBytes.array();

        String xml = LingoesLd2Reader.strip(
                xmlStringDecoder.decode(bytes, offsetXml + lastXmlPos, currentXmlOffset - lastXmlPos));
        while (refs-- > 0) {
            // Each reference is a 4-byte definition number stored in front of the word itself.
            final int ref = inflatedBytes.getInt(offsetWords + lastWordPos);
            LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * ref, idxData);
            lastXmlPos = idxData[1];
            currentXmlOffset = idxData[5];
            final String refXml = LingoesLd2Reader.strip(
                    xmlStringDecoder.decode(bytes, offsetXml + lastXmlPos, currentXmlOffset - lastXmlPos));
            xml = xml.isEmpty() ? refXml : (refXml + ", " + xml);
            lastWordPos += 4;
        }
        defData[1] = xml;
        defData[0] = wordStringDecoder.decode(bytes, offsetWords + lastWordPos, currentWordOffset - lastWordPos);
    }

    /**
     * Reads the dictionary section header at {@code offsetWithIndex}, prints its layout, inflates the
     * compressed streams to {@code ld2File + ".inflated"} and, when that file exists, extracts its content.
     *
     * @param ld2File path of the LD2 file; also the prefix of all generated files
     * @param dataRawBytes complete LD2 file content in little-endian order
     * @param offsetWithIndex offset of the dictionary section header
     * @throws IOException if extraction fails
     */
    private static void readDictionary(final String ld2File, final ByteBuffer dataRawBytes,
            final int offsetWithIndex) throws IOException, FileNotFoundException, UnsupportedEncodingException {
        System.out.println("词典类型:0x" + Integer.toHexString(dataRawBytes.getInt(offsetWithIndex)));
        final int limit = dataRawBytes.getInt(offsetWithIndex + 4) + offsetWithIndex + 8;
        final int offsetIndex = offsetWithIndex + 0x1C;
        final int offsetCompressedDataHeader = dataRawBytes.getInt(offsetWithIndex + 8) + offsetIndex;
        final int inflatedWordsIndexLength = dataRawBytes.getInt(offsetWithIndex + 12);
        final int inflatedWordsLength = dataRawBytes.getInt(offsetWithIndex + 16);
        final int inflatedXmlLength = dataRawBytes.getInt(offsetWithIndex + 20);
        final int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;

        // Collect the stream end offsets; the table ends where the last offset reaches the section limit.
        final List<Integer> deflateStreams = new ArrayList<>();
        dataRawBytes.position(offsetCompressedDataHeader + 8);
        int offset = dataRawBytes.getInt();
        while ((offset + dataRawBytes.position()) < limit) {
            offset = dataRawBytes.getInt();
            deflateStreams.add(Integer.valueOf(offset));
        }
        final int offsetCompressedData = dataRawBytes.position();

        System.out.println("索引词组数目:" + definitions);
        System.out.println("索引地址/大小:0x" + Integer.toHexString(offsetIndex) + " / "
                + (offsetCompressedDataHeader - offsetIndex) + " B");
        System.out.println("压缩数据地址/大小:0x" + Integer.toHexString(offsetCompressedData) + " / "
                + (limit - offsetCompressedData) + " B");
        System.out.println("词组索引地址/大小(解压缩后):0x0 / " + inflatedWordsIndexLength + " B");
        System.out.println("词组地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength) + " / "
                + inflatedWordsLength + " B");
        System.out.println("XML地址/大小(解压缩后):0x"
                + Integer.toHexString(inflatedWordsIndexLength + inflatedWordsLength) + " / " + inflatedXmlLength
                + " B");
        System.out.println("文件大小(解压缩后):"
                + ((inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024) + " KB");

        final String inflatedFile = ld2File + ".inflated";
        LingoesLd2Reader.inflate(dataRawBytes, deflateStreams, inflatedFile);

        if (new File(inflatedFile).isFile()) {
            final String indexFile = ld2File + ".idx";
            final String extractedFile = ld2File + ".words";
            final String extractedXmlFile = ld2File + ".xml";
            final String extractedOutputFile = ld2File + ".output";

            dataRawBytes.position(offsetIndex);
            final int[] idxArray = new int[definitions];
            for (int i = 0; i < definitions; i++) {
                idxArray[i] = dataRawBytes.getInt();
            }
            LingoesLd2Reader.extract(inflatedFile, indexFile, extractedFile, extractedXmlFile, extractedOutputFile,
                    idxArray, inflatedWordsIndexLength, inflatedWordsIndexLength + inflatedWordsLength);
        }
    }

    /**
     * Reduces a translation to plain text. In order of precedence: the content of the first CDATA section;
     * the content of the first {@code <Ô…>} element; otherwise all text found outside of tags. Tabs, line
     * feeds and the control characters U+001E / U+001F are replaced by spaces.
     *
     * @param xml decoded translation
     * @return the plain text, or an empty string when a CDATA section or {@code <Ô} element is not terminated
     */
    private static String strip(final String xml) {
        final String cdataStart = "<![CDATA[";
        final int cdataOpen = xml.indexOf(cdataStart);
        if (cdataOpen != -1) {
            final int cdataEnd = xml.indexOf("]]>", cdataOpen);
            if (cdataEnd == -1) {
                return "";
            }
            return LingoesLd2Reader.flattenControlChars(xml.substring(cdataOpen + cdataStart.length(), cdataEnd));
        }

        final int elementOpen = xml.indexOf("<Ô");
        if (elementOpen != -1) {
            final int elementEnd = xml.indexOf("</Ô", elementOpen);
            final int openTagEnd = xml.indexOf('>', elementOpen + 1);
            // The opening tag has to be closed before the end tag starts, otherwise there is no content.
            if ((elementEnd == -1) || (openTagEnd == -1) || (openTagEnd >= elementEnd)) {
                return "";
            }
            return LingoesLd2Reader.flattenControlChars(xml.substring(openTagEnd + 1, elementEnd));
        }

        final StringBuilder text = new StringBuilder(xml.length());
        boolean insideTag = false;
        for (int i = 0; i < xml.length(); i++) {
            final char c = xml.charAt(i);
            if (c == '<') {
                insideTag = true;
            } else if (insideTag) {
                if (c == '>') {
                    insideTag = false;
                }
            } else {
                text.append(c);
            }
        }
        return LingoesLd2Reader.flattenControlChars(text.toString());
    }

    /** Replaces tab, line feed, U+001E and U+001F with a space so every entry stays on one output line. */
    private static String flattenControlChars(final String text) {
        return text.replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
    }

    /**
     * Copies {@code in} to {@code out} until end of stream. Neither stream is closed.
     *
     * @throws IOException if reading or writing fails
     */
    private static void writeInputStream(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = in.read(buffer)) != -1) {
            out.write(buffer, 0, len);
        }
    }

    /**
     * Charset decoder that reports malformed or unmappable input instead of silently substituting it, which
     * is what makes trial decoding in {@link #detectEncodings} meaningful. Instances are not thread-safe
     * because they reuse one {@link CharsetDecoder}.
     */
    private static final class SensitiveStringDecoder {
        /** Canonical name of the charset, used for log output. */
        public final String name;
        private final CharsetDecoder cd;

        SensitiveStringDecoder(final Charset cs) {
            this.cd = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
            this.name = cs.name();
        }

        /**
         * Decodes {@code len} bytes of {@code ba} starting at {@code off}.
         *
         * @return the decoded text; empty when {@code len} is 0
         * @throws CharacterCodingException if the bytes are malformed or unmappable in this charset
         * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not lie within {@code ba}
         */
        String decode(final byte[] ba, final int off, final int len) throws CharacterCodingException {
            // CharsetDecoder.decode(ByteBuffer) resets, decodes and flushes in one call.
            return this.cd.decode(ByteBuffer.wrap(ba, off, len)).toString();
        }
    }
}
以上~
作为一个NLPer,拥有的文本相关资源当然越多越好,词典资源就是其中之一。灵格斯里有那么多词典,如何提取其中的内容是个问题。之前在网上搜了一些相关信息,最终找到一份Java代码,通过它可以直接把灵格斯.ld2格式词典的内容提取出来。具体做法应该是根据.ld2的文件格式进行解析,从而得到其中的内容。
感谢“Copyright (c) 2010 Xiaoyun Zhu”和“@author keke”,提供了这么方便的接口!
使用时,只需修改 final String ld2File = "D:\\kr.ld2"; 这一行中的路径,即可处理自己想要的词典。

/* Copyright (c) 2010 Xiaoyun Zhu
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
/**
 * Lingoes LD2/LDF File Reader
 *
 * <pre>
 * Lingoes Format overview:
 *
 * General Information:
 * - Dictionary data are stored in deflate streams.
 * - Index group information is stored in an index array in the LD2 file itself.
 * - Numbers are using little endian byte order.
 * - Definitions and xml data have UTF-8 or UTF-16LE encodings.
 *
 * LD2 file schema:
 * - File Header
 * - File Description
 * - Additional Information (optional)
 * - Index Group (corresponds to definitions in dictionary)
 * - Deflated Dictionary Streams
 * -- Index Data
 * --- Offsets of definitions
 * --- Offsets of translations
 * --- Flags
 * --- References to other translations
 * -- Definitions
 * -- Translations (xml)
 *
 * TODO: find encoding / language fields to replace auto-detect of encodings
 *
 * </pre>
 *
 * @author keke
 */
public class LingoesLd2Reader {
    /** Dictionary that is processed when no path is passed on the command line. */
    private static final String DEFAULT_LD2_FILE = "D:\\kr.ld2";

    /**
     * Candidate decoders tried by {@link #detectEncodings}, in priority order. Index 1 (UTF-16LE) is the
     * fallback; the first three charsets are guaranteed by the Java platform, so that index is stable even
     * when the optional EUC-JP charset is missing.
     */
    private static final SensitiveStringDecoder[] AVAIL_ENCODINGS =
            availableDecoders("UTF-8", "UTF-16LE", "UTF-16BE", "EUC-JP");

    /**
     * Extracts an LD2 dictionary into {@code .inflated}, {@code .idx}, {@code .words}, {@code .xml} and
     * {@code .output} files written next to the input file.
     *
     * @param args optional; {@code args[0]} is the path of the LD2 file, otherwise the built-in default is used
     * @throws IOException if the dictionary cannot be read or an output file cannot be written
     */
    public static void main(final String[] args) throws IOException {
        // download from
        // https://skydrive.live.com/?cid=a10100d37adc7ad3&sc=documents&id=A10100D37ADC7AD3%211172#cid=A10100D37ADC7AD3&sc=documents
        final String ld2File = ((args != null) && (args.length > 0)) ? args[0] : DEFAULT_LD2_FILE;

        // read lingoes ld2 into byte array
        final ByteBuffer dataRawBytes = LingoesLd2Reader.readFully(ld2File);

        System.out.println("文件:" + ld2File);
        if (dataRawBytes.limit() < 0x60) {
            // Too short to even hold the fixed header fields read below.
            System.err.println("文件不包含字典数据。网上字典?");
            return;
        }
        System.out.println("类型:" + new String(dataRawBytes.array(), 0, 4, StandardCharsets.US_ASCII));
        System.out.println("版本:" + dataRawBytes.getShort(0x18) + "." + dataRawBytes.getShort(0x1A));
        System.out.println("ID: 0x" + Long.toHexString(dataRawBytes.getLong(0x1C)));

        final int offsetData = dataRawBytes.getInt(0x5C) + 0x60;
        if (dataRawBytes.limit() > offsetData) {
            System.out.println("简介地址:0x" + Integer.toHexString(offsetData));
            final int type = dataRawBytes.getInt(offsetData);
            System.out.println("简介类型:0x" + Integer.toHexString(type));
            final int offsetWithInfo = dataRawBytes.getInt(offsetData + 4) + offsetData + 12;
            if (type == 3) {
                // without additional information
                LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetData);
            } else if (dataRawBytes.limit() > (offsetWithInfo - 0x1C)) {
                LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetWithInfo);
            } else {
                System.err.println("文件不包含字典数据。网上字典?");
            }
        } else {
            System.err.println("文件不包含字典数据。网上字典?");
        }
    }

    /** Builds one strict decoder per charset name, skipping charsets this runtime does not provide. */
    private static SensitiveStringDecoder[] availableDecoders(final String... charsetNames) {
        final List<SensitiveStringDecoder> decoders = new ArrayList<>();
        for (final String charsetName : charsetNames) {
            if (Charset.isSupported(charsetName)) {
                decoders.add(new SensitiveStringDecoder(Charset.forName(charsetName)));
            }
        }
        return decoders.toArray(new SensitiveStringDecoder[0]);
    }

    /**
     * Loads a whole file into a heap buffer configured for little-endian access and positioned at 0.
     *
     * @param path file to read
     * @return buffer holding the complete file content
     * @throws IOException if the file is larger than a single buffer can hold, ends early, or cannot be read
     */
    private static ByteBuffer readFully(final String path) throws IOException {
        try (RandomAccessFile file = new RandomAccessFile(path, "r"); FileChannel channel = file.getChannel()) {
            final long size = channel.size();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File too large to load into memory: " + path);
            }
            final ByteBuffer buffer = ByteBuffer.allocate((int) size);
            // A single read() call may return before the buffer is full, so keep reading.
            while (buffer.hasRemaining()) {
                if (channel.read(buffer) < 0) {
                    throw new IOException("Unexpected end of file: " + path);
                }
            }
            buffer.order(ByteOrder.LITTLE_ENDIAN);
            buffer.rewind();
            return buffer;
        }
    }

    /**
     * Inflates one deflate stream taken from {@code data} and writes the result to {@code inflatedFile}.
     *
     * @param inflatedFile destination file
     * @param data buffer whose backing array contains the compressed stream
     * @param offset start of the compressed stream in the backing array
     * @param length length of the compressed stream in bytes
     * @param append {@code true} to append to the destination, {@code false} to overwrite it
     * @return number of compressed bytes consumed by the inflater
     * @throws IOException if the stream is corrupt or the destination cannot be written
     */
    private static long decompress(final String inflatedFile, final ByteBuffer data, final int offset,
            final int length, final boolean append) throws IOException {
        final Inflater inflater = new Inflater();
        try (InflaterInputStream in = new InflaterInputStream(
                new ByteArrayInputStream(data.array(), offset, length), inflater, 1024 * 8);
                FileOutputStream out = new FileOutputStream(inflatedFile, append)) {
            LingoesLd2Reader.writeInputStream(in, out);
            return inflater.getBytesRead();
        } finally {
            // A caller-supplied Inflater is not released by InflaterInputStream.close().
            inflater.end();
        }
    }

    /**
     * Tries every (word, xml) decoder pair on the first definitions (at most 10) and returns the first pair
     * that decodes all of them without error.
     *
     * @return two-element array: word decoder, xml decoder; UTF-16LE for both when no pair succeeds
     */
    private static SensitiveStringDecoder[] detectEncodings(final ByteBuffer inflatedBytes, final int offsetWords,
            final int offsetXml, final int defTotal, final int dataLen, final int[] idxData,
            final String[] defData) {
        final int test = Math.min(defTotal, 10);
        for (final SensitiveStringDecoder wordDecoder : LingoesLd2Reader.AVAIL_ENCODINGS) {
            for (final SensitiveStringDecoder xmlDecoder : LingoesLd2Reader.AVAIL_ENCODINGS) {
                try {
                    for (int i = 0; i < test; i++) {
                        LingoesLd2Reader.readDefinitionData(inflatedBytes, offsetWords, offsetXml, dataLen,
                                wordDecoder, xmlDecoder, idxData, defData, i);
                    }
                } catch (final CharacterCodingException | RuntimeException ignored) {
                    // This pair cannot decode the sample (or produced out-of-range offsets); try the next one.
                    continue;
                }
                System.out.println("词组编码:" + wordDecoder.name);
                System.out.println("XML编码:" + xmlDecoder.name);
                return new SensitiveStringDecoder[] { wordDecoder, xmlDecoder };
            }
        }
        System.err.println("自动识别编码失败!选择UTF-16LE继续。");
        return new SensitiveStringDecoder[] { LingoesLd2Reader.AVAIL_ENCODINGS[1],
                LingoesLd2Reader.AVAIL_ENCODINGS[1] };
    }

    /**
     * Decodes every definition of the inflated data and writes the words, the stripped translations, the
     * combined {@code word=translation} lines and the index listing, all as UTF-8 text.
     *
     * @param inflatedFile file produced by {@link #inflate}
     * @param indexFile receives one {@code word, index} line per entry of {@code idxArray}
     * @param extractedWordsFile receives one word per line
     * @param extractedXmlFile receives one stripped translation per line
     * @param extractedOutputFile receives one {@code word=translation} line per definition
     * @param idxArray index group read from the LD2 file; each value is a definition number
     * @param offsetDefs offset of the words section in the inflated data (equals the length of the index section)
     * @param offsetXml offset of the xml section in the inflated data
     * @throws IOException if reading, decoding or writing fails
     */
    private static void extract(final String inflatedFile, final String indexFile, final String extractedWordsFile,
            final String extractedXmlFile, final String extractedOutputFile, final int[] idxArray,
            final int offsetDefs, final int offsetXml)
            throws IOException, FileNotFoundException, UnsupportedEncodingException {
        System.out.println("写入'" + extractedOutputFile + "'。。。");

        // read inflated data
        final ByteBuffer dataRawBytes = LingoesLd2Reader.readFully(inflatedFile);

        final int dataLen = 10;
        // Each lookup also reads the start of the following record, so the last record has no definition.
        final int defTotal = Math.max(0, (offsetDefs / dataLen) - 1);

        final String[] words = new String[defTotal];
        final int[] idxData = new int[6];
        final String[] defData = new String[2];

        final SensitiveStringDecoder[] encodings = LingoesLd2Reader.detectEncodings(dataRawBytes, offsetDefs,
                offsetXml, defTotal, dataLen, idxData, defData);

        int counter = 0;
        try (Writer indexWriter = LingoesLd2Reader.openUtf8Writer(indexFile);
                Writer defsWriter = LingoesLd2Reader.openUtf8Writer(extractedWordsFile);
                Writer xmlWriter = LingoesLd2Reader.openUtf8Writer(extractedXmlFile);
                Writer outputWriter = LingoesLd2Reader.openUtf8Writer(extractedOutputFile)) {
            for (int i = 0; i < defTotal; i++) {
                LingoesLd2Reader.readDefinitionData(dataRawBytes, offsetDefs, offsetXml, dataLen, encodings[0],
                        encodings[1], idxData, defData, i);

                words[i] = defData[0];
                defsWriter.write(defData[0]);
                defsWriter.write("\n");

                xmlWriter.write(defData[1]);
                xmlWriter.write("\n");

                outputWriter.write(defData[0]);
                outputWriter.write("=");
                outputWriter.write(defData[1]);
                outputWriter.write("\n");

                System.out.println(defData[0] + " = " + defData[1]);
                counter++;
            }

            for (final int idx : idxArray) {
                if ((idx < 0) || (idx >= words.length)) {
                    // The index group points at a definition that was not decoded; skip it instead of crashing.
                    System.err.println("索引超出范围,已跳过:" + idx);
                    continue;
                }
                indexWriter.write(words[idx]);
                indexWriter.write(", ");
                indexWriter.write(String.valueOf(idx));
                indexWriter.write("\n");
            }
        }
        System.out.println("成功读出" + counter + "组数据。");
    }

    /** Opens a buffered writer that always encodes as UTF-8, independent of the platform default charset. */
    private static Writer openUtf8Writer(final String path) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8));
    }

    /**
     * Reads 18 bytes starting at {@code position}: two ints, two unsigned bytes and two more ints, stored
     * in that order into {@code wordIdxData[0..5]}. The buffer position is left after the last int.
     *
     * @param dataRawBytes buffer to read from (its byte order is used as configured by the caller)
     * @param position absolute offset of the first int
     * @param wordIdxData receives the six values; must have at least six elements
     */
    private static void getIdxData(final ByteBuffer dataRawBytes, final int position, final int[] wordIdxData) {
        dataRawBytes.position(position);
        wordIdxData[0] = dataRawBytes.getInt();
        wordIdxData[1] = dataRawBytes.getInt();
        wordIdxData[2] = dataRawBytes.get() & 0xff;
        wordIdxData[3] = dataRawBytes.get() & 0xff;
        wordIdxData[4] = dataRawBytes.getInt();
        wordIdxData[5] = dataRawBytes.getInt();
    }

    /**
     * Inflates all deflate streams, which are stored back to back starting at the buffer's current position,
     * into {@code inflatedFile}. A failure is reported on stderr and stops the inflation; streams already
     * written stay in the file.
     *
     * @param dataRawBytes LD2 file content, positioned at the first compressed stream
     * @param deflateStreams end offset of each stream, relative to the start position
     * @param inflatedFile destination file
     */
    private static void inflate(final ByteBuffer dataRawBytes, final List<Integer> deflateStreams,
            final String inflatedFile) {
        System.out.println("解压缩'" + deflateStreams.size() + "'个数据流至'" + inflatedFile + "'。。。");
        final int startOffset = dataRawBytes.position();
        int offset = -1;
        int lastOffset = startOffset;
        boolean append = false;
        try {
            for (final Integer offsetRelative : deflateStreams) {
                offset = startOffset + offsetRelative.intValue();
                LingoesLd2Reader.decompress(inflatedFile, dataRawBytes, lastOffset, offset - lastOffset, append);
                append = true;
                lastOffset = offset;
            }
        } catch (final IOException | RuntimeException e) {
            System.err.println("解压缩失败: 0x" + Integer.toHexString(offset) + ": " + e.toString());
        }
    }

    /**
     * Decodes definition number {@code i}: the word goes to {@code defData[0]} and the stripped translation
     * to {@code defData[1]}. When the index record carries references, the translations of the referenced
     * definitions are placed in front of the entry's own translation, separated by {@code ", "}.
     *
     * @param inflatedBytes inflated dictionary data (index, words, xml)
     * @param offsetWords offset of the words section
     * @param offsetXml offset of the xml section
     * @param dataLen distance in bytes between two index records
     * @param wordStringDecoder decoder for the word bytes
     * @param xmlStringDecoder decoder for the xml bytes
     * @param idxData scratch array with at least six elements
     * @param defData receives word and translation; must have at least two elements
     * @param i definition number
     * @throws CharacterCodingException if the bytes are not valid in the given encodings
     */
    private static void readDefinitionData(final ByteBuffer inflatedBytes, final int offsetWords,
            final int offsetXml, final int dataLen, final SensitiveStringDecoder wordStringDecoder,
            final SensitiveStringDecoder xmlStringDecoder, final int[] idxData, final String[] defData,
            final int i) throws CharacterCodingException {
        LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * i, idxData);
        int lastWordPos = idxData[0];
        int lastXmlPos = idxData[1];
        // idxData[2] holds the flags, which are not used here.
        int refs = idxData[3];
        // With 10-byte records the last two values are the start offsets stored in the next record,
        // i.e. the end offsets of this entry.
        final int currentWordOffset = idxData[4];
        int currentXmlOffset = idxData[5];
        final byte[] bytes = inflatedBytes.array();

        String xml = LingoesLd2Reader.strip(
                xmlStringDecoder.decode(bytes, offsetXml + lastXmlPos, currentXmlOffset - lastXmlPos));
        while (refs-- > 0) {
            // Each reference is a 4-byte definition number stored in front of the word itself.
            final int ref = inflatedBytes.getInt(offsetWords + lastWordPos);
            LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * ref, idxData);
            lastXmlPos = idxData[1];
            currentXmlOffset = idxData[5];
            final String refXml = LingoesLd2Reader.strip(
                    xmlStringDecoder.decode(bytes, offsetXml + lastXmlPos, currentXmlOffset - lastXmlPos));
            xml = xml.isEmpty() ? refXml : (refXml + ", " + xml);
            lastWordPos += 4;
        }
        defData[1] = xml;
        defData[0] = wordStringDecoder.decode(bytes, offsetWords + lastWordPos, currentWordOffset - lastWordPos);
    }

    /**
     * Reads the dictionary section header at {@code offsetWithIndex}, prints its layout, inflates the
     * compressed streams to {@code ld2File + ".inflated"} and, when that file exists, extracts its content.
     *
     * @param ld2File path of the LD2 file; also the prefix of all generated files
     * @param dataRawBytes complete LD2 file content in little-endian order
     * @param offsetWithIndex offset of the dictionary section header
     * @throws IOException if extraction fails
     */
    private static void readDictionary(final String ld2File, final ByteBuffer dataRawBytes,
            final int offsetWithIndex) throws IOException, FileNotFoundException, UnsupportedEncodingException {
        System.out.println("词典类型:0x" + Integer.toHexString(dataRawBytes.getInt(offsetWithIndex)));
        final int limit = dataRawBytes.getInt(offsetWithIndex + 4) + offsetWithIndex + 8;
        final int offsetIndex = offsetWithIndex + 0x1C;
        final int offsetCompressedDataHeader = dataRawBytes.getInt(offsetWithIndex + 8) + offsetIndex;
        final int inflatedWordsIndexLength = dataRawBytes.getInt(offsetWithIndex + 12);
        final int inflatedWordsLength = dataRawBytes.getInt(offsetWithIndex + 16);
        final int inflatedXmlLength = dataRawBytes.getInt(offsetWithIndex + 20);
        final int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;

        // Collect the stream end offsets; the table ends where the last offset reaches the section limit.
        final List<Integer> deflateStreams = new ArrayList<>();
        dataRawBytes.position(offsetCompressedDataHeader + 8);
        int offset = dataRawBytes.getInt();
        while ((offset + dataRawBytes.position()) < limit) {
            offset = dataRawBytes.getInt();
            deflateStreams.add(Integer.valueOf(offset));
        }
        final int offsetCompressedData = dataRawBytes.position();

        System.out.println("索引词组数目:" + definitions);
        System.out.println("索引地址/大小:0x" + Integer.toHexString(offsetIndex) + " / "
                + (offsetCompressedDataHeader - offsetIndex) + " B");
        System.out.println("压缩数据地址/大小:0x" + Integer.toHexString(offsetCompressedData) + " / "
                + (limit - offsetCompressedData) + " B");
        System.out.println("词组索引地址/大小(解压缩后):0x0 / " + inflatedWordsIndexLength + " B");
        System.out.println("词组地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength) + " / "
                + inflatedWordsLength + " B");
        System.out.println("XML地址/大小(解压缩后):0x"
                + Integer.toHexString(inflatedWordsIndexLength + inflatedWordsLength) + " / " + inflatedXmlLength
                + " B");
        System.out.println("文件大小(解压缩后):"
                + ((inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024) + " KB");

        final String inflatedFile = ld2File + ".inflated";
        LingoesLd2Reader.inflate(dataRawBytes, deflateStreams, inflatedFile);

        if (new File(inflatedFile).isFile()) {
            final String indexFile = ld2File + ".idx";
            final String extractedFile = ld2File + ".words";
            final String extractedXmlFile = ld2File + ".xml";
            final String extractedOutputFile = ld2File + ".output";

            dataRawBytes.position(offsetIndex);
            final int[] idxArray = new int[definitions];
            for (int i = 0; i < definitions; i++) {
                idxArray[i] = dataRawBytes.getInt();
            }
            LingoesLd2Reader.extract(inflatedFile, indexFile, extractedFile, extractedXmlFile, extractedOutputFile,
                    idxArray, inflatedWordsIndexLength, inflatedWordsIndexLength + inflatedWordsLength);
        }
    }

    /**
     * Reduces a translation to plain text. In order of precedence: the content of the first CDATA section;
     * the content of the first {@code <Ô…>} element; otherwise all text found outside of tags. Tabs, line
     * feeds and the control characters U+001E / U+001F are replaced by spaces.
     *
     * @param xml decoded translation
     * @return the plain text, or an empty string when a CDATA section or {@code <Ô} element is not terminated
     */
    private static String strip(final String xml) {
        final String cdataStart = "<![CDATA[";
        final int cdataOpen = xml.indexOf(cdataStart);
        if (cdataOpen != -1) {
            final int cdataEnd = xml.indexOf("]]>", cdataOpen);
            if (cdataEnd == -1) {
                return "";
            }
            return LingoesLd2Reader.flattenControlChars(xml.substring(cdataOpen + cdataStart.length(), cdataEnd));
        }

        final int elementOpen = xml.indexOf("<Ô");
        if (elementOpen != -1) {
            final int elementEnd = xml.indexOf("</Ô", elementOpen);
            final int openTagEnd = xml.indexOf('>', elementOpen + 1);
            // The opening tag has to be closed before the end tag starts, otherwise there is no content.
            if ((elementEnd == -1) || (openTagEnd == -1) || (openTagEnd >= elementEnd)) {
                return "";
            }
            return LingoesLd2Reader.flattenControlChars(xml.substring(openTagEnd + 1, elementEnd));
        }

        final StringBuilder text = new StringBuilder(xml.length());
        boolean insideTag = false;
        for (int i = 0; i < xml.length(); i++) {
            final char c = xml.charAt(i);
            if (c == '<') {
                insideTag = true;
            } else if (insideTag) {
                if (c == '>') {
                    insideTag = false;
                }
            } else {
                text.append(c);
            }
        }
        return LingoesLd2Reader.flattenControlChars(text.toString());
    }

    /** Replaces tab, line feed, U+001E and U+001F with a space so every entry stays on one output line. */
    private static String flattenControlChars(final String text) {
        return text.replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
    }

    /**
     * Copies {@code in} to {@code out} until end of stream. Neither stream is closed.
     *
     * @throws IOException if reading or writing fails
     */
    private static void writeInputStream(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = in.read(buffer)) != -1) {
            out.write(buffer, 0, len);
        }
    }

    /**
     * Charset decoder that reports malformed or unmappable input instead of silently substituting it, which
     * is what makes trial decoding in {@link #detectEncodings} meaningful. Instances are not thread-safe
     * because they reuse one {@link CharsetDecoder}.
     */
    private static final class SensitiveStringDecoder {
        /** Canonical name of the charset, used for log output. */
        public final String name;
        private final CharsetDecoder cd;

        SensitiveStringDecoder(final Charset cs) {
            this.cd = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
            this.name = cs.name();
        }

        /**
         * Decodes {@code len} bytes of {@code ba} starting at {@code off}.
         *
         * @return the decoded text; empty when {@code len} is 0
         * @throws CharacterCodingException if the bytes are malformed or unmappable in this charset
         * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not lie within {@code ba}
         */
        String decode(final byte[] ba, final int off, final int len) throws CharacterCodingException {
            // CharsetDecoder.decode(ByteBuffer) resets, decodes and flushes in one call.
            return this.cd.decode(ByteBuffer.wrap(ba, off, len)).toString();
        }
    }
}

// Trailing article text carried over from the source page (the fused "1...400" run was line-number residue):
// 以上~
// ————————————————
// 版权声明:本文为CSDN博主「hikaru_go」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
// 原文链接:https://blog.csdn.net/hikaru_go/java/article/details/69213858