GBK2ASC

  1 /**
  2  * Copyright (C) 2009 The Android Open Source Project
  3  *
  4  * Licensed under the Apache License, Version 2.0 (the "License");
  5  * you may not use this file except in compliance with the License.
  6  * You may obtain a copy of the License at
  7  *
  8  *      http://www.apache.org/licenses/LICENSE-2.0
  9  *
 10  * Unless required by applicable law or agreed to in writing, software
 11  * distributed under the License is distributed on an "AS IS" BASIS,
 12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  * See the License for the specific language governing permissions and
 14  * limitations under the License.
 15  */
 16 package com.thunder.gbktoasc;
 17 import android.text.TextUtils;
 18 import android.util.Log;
 19 import java.text.Collator;
 20 import java.util.ArrayList;
 21 import java.util.Locale;
 22 /**
 23  * An object to convert Chinese character to its corresponding pinyin string. For characters with
 24  * multiple possible pinyin string, only one is selected according to collator. Polyphone is not
 25  * supported in this implementation. This class is implemented to achieve the best runtime
 26  * performance and minimum runtime resources with tolerable sacrifice of accuracy. This
 27  * implementation highly depends on zh_CN ICU collation data and must be always synchronized with
 28  * ICU.
 29  *
 30  * Currently this file is aligned to zh.txt in ICU 4.6
 31  */
 32 public class HanziToPinyin {
 33     private static final String TAG = "HanziToPinyin"; // tag passed to every Log call in this class
 34     // When true, getInstance() runs doSelfValidation() once and logs the result.
 35     private static final boolean DEBUG = false;
 36     /**
 37      * Unihans array. Each unihans is the first one within same pinyin. Use it to determine pinyin
 38      * for all ~20k unihans.
 39      */
 40     public static final char[] UNIHANS = {
                // Entry i is looked up together with PINYINS[i]. getToken() binary-searches this
                // table with COLLATOR, so entries must stay in ascending collation order
                // (doSelfValidation() checks exactly that when DEBUG is on).
 41             '\u5475', '\u54ce', '\u5b89', '\u80ae', '\u51f9',
 42             '\u516b', '\u6300', '\u6273', '\u90a6', '\u5305', '\u5351', '\u5954', '\u4f3b',
 43             '\u5c44', '\u8fb9', '\u6807', '\u618b', '\u90a0', '\u69df', '\u7676', '\u5cec',
 44             '\u5693', '\u5a47', '\u98e1', '\u4ed3', '\u64cd', '\u518a', '\u5d7e', '\u564c',
 45             '\u53c9', '\u9497', '\u8fbf', '\u4f25', '\u6284', '\u8f66', '\u62bb', '\u67fd',
 46             '\u5403', '\u5145', '\u62bd', '\u51fa', '\u6b3b', '\u63e3', '\u5ddd', '\u75ae',
 47             '\u5439', '\u6776', '\u9034', '\u75b5', '\u5306', '\u51d1', '\u7c97', '\u6c46',
 48             '\u5d14', '\u90a8', '\u6413', '\u5491', '\u5927', '\u75b8', '\u5f53', '\u5200',
 49             '\u6dc2', '\u5f97', '\u6265', '\u706f', '\u6c10', '\u55f2', '\u7538', '\u5201',
 50             '\u7239', '\u4ec3', '\u4e1f', '\u4e1c', '\u5517', '\u561f', '\u5073', '\u5806',
 51             '\u9413', '\u591a', '\u5a40', '\u8bf6', '\u5940', '\u97a5', '\u800c', '\u53d1',
 52             '\u5e06', '\u65b9', '\u98de', '\u5206', '\u4e30', '\u8985', '\u4ecf', '\u7d11',
 53             '\u4f15', '\u65ee', '\u8be5', '\u7518', '\u5188', '\u768b', '\u6208', '\u7d66',
 54             '\u6839', '\u5e9a', '\u5de5', '\u52fe', '\u4f30', '\u74dc', '\u7f6b', '\u5173',
 55             '\u5149', '\u5f52', '\u886e', '\u5459', '\u54c8', '\u54b3', '\u9878', '\u82c0',
 56             '\u84bf', '\u8bc3', '\u9ed2', '\u62eb', '\u4ea8', '\u5677', '\u543d', '\u9f41',
 57             '\u5322', '\u82b1', '\u6000', '\u72bf', '\u5ddf', '\u7070', '\u660f', '\u5419',
 58             '\u4e0c', '\u52a0', '\u620b', '\u6c5f', '\u827d', '\u9636', '\u5dfe', '\u52a4',
 59             '\u5182', '\u52fc', '\u530a', '\u5a1f', '\u5658', '\u519b', '\u5494', '\u5f00',
 60             '\u520a', '\u95f6', '\u5c3b', '\u533c', '\u524b', '\u80af', '\u962c', '\u7a7a',
 61             '\u62a0', '\u5233', '\u5938', '\u84af', '\u5bbd', '\u5321', '\u4e8f', '\u5764',
 62             '\u6269', '\u5783', '\u6765', '\u5170', '\u5577', '\u635e', '\u4ec2', '\u52d2',
 63             '\u5844', '\u5215', '\u5006', '\u5941', '\u826f', '\u64a9', '\u5217', '\u62ce',
 64             '\u3007', '\u6e9c', '\u9f99', '\u779c', '\u565c', '\u5a08', '\u7567', '\u62a1',
 65             '\u7f57', '\u5463', '\u5988', '\u973e', '\u5ada', '\u9099', '\u732b', '\u9ebc',
 66             '\u6c92', '\u95e8', '\u753f', '\u54aa', '\u7720', '\u55b5', '\u54a9', '\u6c11',
 67             '\u540d', '\u8c2c', '\u6478', '\u54de', '\u6bea', '\u62cf', '\u5b7b', '\u56e1',
 68             '\u56ca', '\u5b6c', '\u8bb7', '\u9981', '\u6041', '\u80fd', '\u59ae', '\u62c8',
 69             '\u5b22', '\u9e1f', '\u634f', '\u60a8', '\u5b81', '\u599e', '\u519c', '\u7fba',
 70             '\u5974', '\u597b', '\u8650', '\u632a', '\u5594', '\u8bb4', '\u8db4', '\u62cd',
 71             '\u7705', '\u4e53', '\u629b', '\u5478', '\u55b7', '\u5309', '\u4e15', '\u504f',
 72             '\u527d', '\u6c15', '\u59d8', '\u4e52', '\u948b', '\u5256', '\u4ec6', '\u4e03',
 73             '\u6390', '\u5343', '\u545b', '\u6084', '\u767f', '\u4fb5', '\u9751', '\u909b',
 74             '\u4e18', '\u66f2', '\u5f2e', '\u7f3a', '\u590b', '\u5465', '\u7a63', '\u5a06',
 75             '\u60f9', '\u4eba', '\u6254', '\u65e5', '\u8338', '\u53b9', '\u5982', '\u5827',
 76             '\u6875', '\u95f0', '\u82e5', '\u4ee8', '\u6be2', '\u4e09', '\u6852', '\u63bb',
 77             '\u8272', '\u68ee', '\u50e7', '\u6740', '\u7b5b', '\u5c71', '\u4f24', '\u5f30',
 78             '\u5962', '\u7533', '\u5347', '\u5c38', '\u53ce', '\u4e66', '\u5237', '\u6454',
 79             '\u95e9', '\u53cc', '\u8c01', '\u542e', '\u5981', '\u53b6', '\u5fea', '\u635c',
 80             '\u82cf', '\u72fb', '\u590a', '\u5b59', '\u5506', '\u4ed6', '\u82d4', '\u574d',
 81             '\u94f4', '\u5932', '\u5fd1', '\u71a5', '\u5254', '\u5929', '\u4f7b', '\u5e16',
 82             '\u5385', '\u56f2', '\u5077', '\u92c0', '\u6e4d', '\u63a8', '\u541e', '\u6258',
 83             '\u6316', '\u6b6a', '\u5f2f', '\u5c2a', '\u5371', '\u586d', '\u7fc1', '\u631d',
 84             '\u5140', '\u5915', '\u867e', '\u4eda', '\u4e61', '\u7071', '\u4e9b', '\u5fc3',
 85             '\u661f', '\u51f6', '\u4f11', '\u65f4', '\u8f69', '\u75b6', '\u52cb', '\u4e2b',
 86             '\u6079', '\u592e', '\u5e7a', '\u8036', '\u4e00', '\u6b2d', '\u5e94', '\u54df',
 87             '\u4f63', '\u4f18', '\u625c', '\u9e22', '\u66f0', '\u6655', '\u531d', '\u707d',
 88             '\u7ccc', '\u7242', '\u50ae', '\u5219', '\u8d3c', '\u600e', '\u5897', '\u5412',
 89             '\u635a', '\u6cbe', '\u5f20', '\u948a', '\u8707', '\u8d1e', '\u4e89', '\u4e4b',
 90             '\u4e2d', '\u5dde', '\u6731', '\u6293', '\u8de9', '\u4e13', '\u5986', '\u96b9',
 91             '\u5b92', '\u5353', '\u5b5c', '\u5b97', '\u90b9', '\u79df', '\u94bb', '\u539c',
 92             '\u5c0a', '\u6628', };
 93     /**
 94      * Pinyin array. Each pinyin is corresponding to unihans of same offset in the unihans array.
 95      */
 96     public static final byte[][] PINYINS = {
                // Each row holds the ASCII codes of one upper-case pinyin syllable, zero-padded to
                // six bytes (e.g. { 65, 73, 0, ... } is "AI"). getToken() reads a row up to the
                // first 0 and casts each byte to char.
 97             { 65, 0, 0, 0, 0, 0 }, { 65, 73, 0, 0, 0, 0 }, { 65, 78, 0, 0, 0, 0 },
 98             { 65, 78, 71, 0, 0, 0 }, { 65, 79, 0, 0, 0, 0 }, { 66, 65, 0, 0, 0, 0 },
 99             { 66, 65, 73, 0, 0, 0 }, { 66, 65, 78, 0, 0, 0 }, { 66, 65, 78, 71, 0, 0 },
100             { 66, 65, 79, 0, 0, 0 }, { 66, 69, 73, 0, 0, 0 }, { 66, 69, 78, 0, 0, 0 },
101             { 66, 69, 78, 71, 0, 0 }, { 66, 73, 0, 0, 0, 0 }, { 66, 73, 65, 78, 0, 0 },
102             { 66, 73, 65, 79, 0, 0 }, { 66, 73, 69, 0, 0, 0 }, { 66, 73, 78, 0, 0, 0 },
103             { 66, 73, 78, 71, 0, 0 }, { 66, 79, 0, 0, 0, 0 }, { 66, 85, 0, 0, 0, 0 },
104             { 67, 65, 0, 0, 0, 0 }, { 67, 65, 73, 0, 0, 0 },
105             { 67, 65, 78, 0, 0, 0 }, { 67, 65, 78, 71, 0, 0 }, { 67, 65, 79, 0, 0, 0 },
106             { 67, 69, 0, 0, 0, 0 }, { 67, 69, 78, 0, 0, 0 }, { 67, 69, 78, 71, 0, 0 },
107             { 67, 72, 65, 0, 0, 0 }, { 67, 72, 65, 73, 0, 0 }, { 67, 72, 65, 78, 0, 0 },
108             { 67, 72, 65, 78, 71, 0 }, { 67, 72, 65, 79, 0, 0 }, { 67, 72, 69, 0, 0, 0 },
109             { 67, 72, 69, 78, 0, 0 }, { 67, 72, 69, 78, 71, 0 }, { 67, 72, 73, 0, 0, 0 },
110             { 67, 72, 79, 78, 71, 0 }, { 67, 72, 79, 85, 0, 0 }, { 67, 72, 85, 0, 0, 0 },
111             { 67, 72, 85, 65, 0, 0 }, { 67, 72, 85, 65, 73, 0 }, { 67, 72, 85, 65, 78, 0 },
112             { 67, 72, 85, 65, 78, 71 }, { 67, 72, 85, 73, 0, 0 }, { 67, 72, 85, 78, 0, 0 },
113             { 67, 72, 85, 79, 0, 0 }, { 67, 73, 0, 0, 0, 0 }, { 67, 79, 78, 71, 0, 0 },
114             { 67, 79, 85, 0, 0, 0 }, { 67, 85, 0, 0, 0, 0 }, { 67, 85, 65, 78, 0, 0 },
115             { 67, 85, 73, 0, 0, 0 }, { 67, 85, 78, 0, 0, 0 }, { 67, 85, 79, 0, 0, 0 },
116             { 68, 65, 0, 0, 0, 0 }, { 68, 65, 73, 0, 0, 0 }, { 68, 65, 78, 0, 0, 0 },
117             { 68, 65, 78, 71, 0, 0 }, { 68, 65, 79, 0, 0, 0 }, { 68, 69, 0, 0, 0, 0 },
118             { 68, 69, 73, 0, 0, 0 }, { 68, 69, 78, 0, 0, 0 }, { 68, 69, 78, 71, 0, 0 },
119             { 68, 73, 0, 0, 0, 0 }, { 68, 73, 65, 0, 0, 0 }, { 68, 73, 65, 78, 0, 0 },
120             { 68, 73, 65, 79, 0, 0 }, { 68, 73, 69, 0, 0, 0 }, { 68, 73, 78, 71, 0, 0 },
121             { 68, 73, 85, 0, 0, 0 }, { 68, 79, 78, 71, 0, 0 }, { 68, 79, 85, 0, 0, 0 },
122             { 68, 85, 0, 0, 0, 0 }, { 68, 85, 65, 78, 0, 0 }, { 68, 85, 73, 0, 0, 0 },
123             { 68, 85, 78, 0, 0, 0 }, { 68, 85, 79, 0, 0, 0 }, { 69, 0, 0, 0, 0, 0 },
124             { 69, 73, 0, 0, 0, 0 }, { 69, 78, 0, 0, 0, 0 }, { 69, 78, 71, 0, 0, 0 },
125             { 69, 82, 0, 0, 0, 0 }, { 70, 65, 0, 0, 0, 0 }, { 70, 65, 78, 0, 0, 0 },
126             { 70, 65, 78, 71, 0, 0 }, { 70, 69, 73, 0, 0, 0 }, { 70, 69, 78, 0, 0, 0 },
127             { 70, 69, 78, 71, 0, 0 }, { 70, 73, 65, 79, 0, 0 }, { 70, 79, 0, 0, 0, 0 },
128             { 70, 79, 85, 0, 0, 0 }, { 70, 85, 0, 0, 0, 0 }, { 71, 65, 0, 0, 0, 0 },
129             { 71, 65, 73, 0, 0, 0 }, { 71, 65, 78, 0, 0, 0 }, { 71, 65, 78, 71, 0, 0 },
130             { 71, 65, 79, 0, 0, 0 }, { 71, 69, 0, 0, 0, 0 }, { 71, 69, 73, 0, 0, 0 },
131             { 71, 69, 78, 0, 0, 0 }, { 71, 69, 78, 71, 0, 0 }, { 71, 79, 78, 71, 0, 0 },
132             { 71, 79, 85, 0, 0, 0 }, { 71, 85, 0, 0, 0, 0 }, { 71, 85, 65, 0, 0, 0 },
133             { 71, 85, 65, 73, 0, 0 }, { 71, 85, 65, 78, 0, 0 }, { 71, 85, 65, 78, 71, 0 },
134             { 71, 85, 73, 0, 0, 0 }, { 71, 85, 78, 0, 0, 0 }, { 71, 85, 79, 0, 0, 0 },
135             { 72, 65, 0, 0, 0, 0 }, { 72, 65, 73, 0, 0, 0 }, { 72, 65, 78, 0, 0, 0 },
136             { 72, 65, 78, 71, 0, 0 }, { 72, 65, 79, 0, 0, 0 }, { 72, 69, 0, 0, 0, 0 },
137             { 72, 69, 73, 0, 0, 0 }, { 72, 69, 78, 0, 0, 0 }, { 72, 69, 78, 71, 0, 0 },
138             { 72, 77, 0, 0, 0, 0 }, { 72, 79, 78, 71, 0, 0 }, { 72, 79, 85, 0, 0, 0 },
139             { 72, 85, 0, 0, 0, 0 }, { 72, 85, 65, 0, 0, 0 }, { 72, 85, 65, 73, 0, 0 },
140             { 72, 85, 65, 78, 0, 0 }, { 72, 85, 65, 78, 71, 0 }, { 72, 85, 73, 0, 0, 0 },
141             { 72, 85, 78, 0, 0, 0 }, { 72, 85, 79, 0, 0, 0 }, { 74, 73, 0, 0, 0, 0 },
142             { 74, 73, 65, 0, 0, 0 }, { 74, 73, 65, 78, 0, 0 }, { 74, 73, 65, 78, 71, 0 },
143             { 74, 73, 65, 79, 0, 0 }, { 74, 73, 69, 0, 0, 0 }, { 74, 73, 78, 0, 0, 0 },
144             { 74, 73, 78, 71, 0, 0 }, { 74, 73, 79, 78, 71, 0 }, { 74, 73, 85, 0, 0, 0 },
145             { 74, 85, 0, 0, 0, 0 }, { 74, 85, 65, 78, 0, 0 }, { 74, 85, 69, 0, 0, 0 },
146             { 74, 85, 78, 0, 0, 0 }, { 75, 65, 0, 0, 0, 0 }, { 75, 65, 73, 0, 0, 0 },
147             { 75, 65, 78, 0, 0, 0 }, { 75, 65, 78, 71, 0, 0 }, { 75, 65, 79, 0, 0, 0 },
148             { 75, 69, 0, 0, 0, 0 }, { 75, 69, 73, 0, 0, 0 }, { 75, 69, 78, 0, 0, 0 },
149             { 75, 69, 78, 71, 0, 0 }, { 75, 79, 78, 71, 0, 0 }, { 75, 79, 85, 0, 0, 0 },
150             { 75, 85, 0, 0, 0, 0 }, { 75, 85, 65, 0, 0, 0 }, { 75, 85, 65, 73, 0, 0 },
151             { 75, 85, 65, 78, 0, 0 }, { 75, 85, 65, 78, 71, 0 }, { 75, 85, 73, 0, 0, 0 },
152             { 75, 85, 78, 0, 0, 0 }, { 75, 85, 79, 0, 0, 0 }, { 76, 65, 0, 0, 0, 0 },
153             { 76, 65, 73, 0, 0, 0 }, { 76, 65, 78, 0, 0, 0 }, { 76, 65, 78, 71, 0, 0 },
154             { 76, 65, 79, 0, 0, 0 }, { 76, 69, 0, 0, 0, 0 }, { 76, 69, 73, 0, 0, 0 },
155             { 76, 69, 78, 71, 0, 0 }, { 76, 73, 0, 0, 0, 0 }, { 76, 73, 65, 0, 0, 0 },
156             { 76, 73, 65, 78, 0, 0 }, { 76, 73, 65, 78, 71, 0 }, { 76, 73, 65, 79, 0, 0 },
157             { 76, 73, 69, 0, 0, 0 }, { 76, 73, 78, 0, 0, 0 }, { 76, 73, 78, 71, 0, 0 },
158             { 76, 73, 85, 0, 0, 0 }, { 76, 79, 78, 71, 0, 0 }, { 76, 79, 85, 0, 0, 0 },
159             { 76, 85, 0, 0, 0, 0 }, { 76, 85, 65, 78, 0, 0 }, { 76, 85, 69, 0, 0, 0 },
160             { 76, 85, 78, 0, 0, 0 }, { 76, 85, 79, 0, 0, 0 }, { 77, 0, 0, 0, 0, 0 },
161             { 77, 65, 0, 0, 0, 0 }, { 77, 65, 73, 0, 0, 0 }, { 77, 65, 78, 0, 0, 0 },
162             { 77, 65, 78, 71, 0, 0 }, { 77, 65, 79, 0, 0, 0 }, { 77, 69, 0, 0, 0, 0 },
163             { 77, 69, 73, 0, 0, 0 }, { 77, 69, 78, 0, 0, 0 }, { 77, 69, 78, 71, 0, 0 },
164             { 77, 73, 0, 0, 0, 0 }, { 77, 73, 65, 78, 0, 0 }, { 77, 73, 65, 79, 0, 0 },
165             { 77, 73, 69, 0, 0, 0 }, { 77, 73, 78, 0, 0, 0 }, { 77, 73, 78, 71, 0, 0 },
166             { 77, 73, 85, 0, 0, 0 }, { 77, 79, 0, 0, 0, 0 }, { 77, 79, 85, 0, 0, 0 },
167             { 77, 85, 0, 0, 0, 0 }, { 78, 65, 0, 0, 0, 0 }, { 78, 65, 73, 0, 0, 0 },
168             { 78, 65, 78, 0, 0, 0 }, { 78, 65, 78, 71, 0, 0 }, { 78, 65, 79, 0, 0, 0 },
169             { 78, 69, 0, 0, 0, 0 }, { 78, 69, 73, 0, 0, 0 }, { 78, 69, 78, 0, 0, 0 },
170             { 78, 69, 78, 71, 0, 0 }, { 78, 73, 0, 0, 0, 0 }, { 78, 73, 65, 78, 0, 0 },
171             { 78, 73, 65, 78, 71, 0 }, { 78, 73, 65, 79, 0, 0 }, { 78, 73, 69, 0, 0, 0 },
172             { 78, 73, 78, 0, 0, 0 }, { 78, 73, 78, 71, 0, 0 }, { 78, 73, 85, 0, 0, 0 },
173             { 78, 79, 78, 71, 0, 0 }, { 78, 79, 85, 0, 0, 0 }, { 78, 85, 0, 0, 0, 0 },
174             { 78, 85, 65, 78, 0, 0 }, { 78, 85, 69, 0, 0, 0 }, { 78, 85, 79, 0, 0, 0 },
175             { 79, 0, 0, 0, 0, 0 }, { 79, 85, 0, 0, 0, 0 }, { 80, 65, 0, 0, 0, 0 },
176             { 80, 65, 73, 0, 0, 0 }, { 80, 65, 78, 0, 0, 0 }, { 80, 65, 78, 71, 0, 0 },
177             { 80, 65, 79, 0, 0, 0 }, { 80, 69, 73, 0, 0, 0 }, { 80, 69, 78, 0, 0, 0 },
178             { 80, 69, 78, 71, 0, 0 }, { 80, 73, 0, 0, 0, 0 }, { 80, 73, 65, 78, 0, 0 },
179             { 80, 73, 65, 79, 0, 0 }, { 80, 73, 69, 0, 0, 0 }, { 80, 73, 78, 0, 0, 0 },
180             { 80, 73, 78, 71, 0, 0 }, { 80, 79, 0, 0, 0, 0 }, { 80, 79, 85, 0, 0, 0 },
181             { 80, 85, 0, 0, 0, 0 }, { 81, 73, 0, 0, 0, 0 }, { 81, 73, 65, 0, 0, 0 },
182             { 81, 73, 65, 78, 0, 0 }, { 81, 73, 65, 78, 71, 0 }, { 81, 73, 65, 79, 0, 0 },
183             { 81, 73, 69, 0, 0, 0 }, { 81, 73, 78, 0, 0, 0 }, { 81, 73, 78, 71, 0, 0 },
184             { 81, 73, 79, 78, 71, 0 }, { 81, 73, 85, 0, 0, 0 }, { 81, 85, 0, 0, 0, 0 },
185             { 81, 85, 65, 78, 0, 0 }, { 81, 85, 69, 0, 0, 0 }, { 81, 85, 78, 0, 0, 0 },
186             { 82, 65, 78, 0, 0, 0 }, { 82, 65, 78, 71, 0, 0 }, { 82, 65, 79, 0, 0, 0 },
187             { 82, 69, 0, 0, 0, 0 }, { 82, 69, 78, 0, 0, 0 }, { 82, 69, 78, 71, 0, 0 },
188             { 82, 73, 0, 0, 0, 0 }, { 82, 79, 78, 71, 0, 0 }, { 82, 79, 85, 0, 0, 0 },
189             { 82, 85, 0, 0, 0, 0 }, { 82, 85, 65, 78, 0, 0 }, { 82, 85, 73, 0, 0, 0 },
190             { 82, 85, 78, 0, 0, 0 }, { 82, 85, 79, 0, 0, 0 }, { 83, 65, 0, 0, 0, 0 },
191             { 83, 65, 73, 0, 0, 0 }, { 83, 65, 78, 0, 0, 0 }, { 83, 65, 78, 71, 0, 0 },
192             { 83, 65, 79, 0, 0, 0 }, { 83, 69, 0, 0, 0, 0 }, { 83, 69, 78, 0, 0, 0 },
193             { 83, 69, 78, 71, 0, 0 }, { 83, 72, 65, 0, 0, 0 }, { 83, 72, 65, 73, 0, 0 },
194             { 83, 72, 65, 78, 0, 0 }, { 83, 72, 65, 78, 71, 0 }, { 83, 72, 65, 79, 0, 0 },
195             { 83, 72, 69, 0, 0, 0 }, { 83, 72, 69, 78, 0, 0 }, { 83, 72, 69, 78, 71, 0 },
196             { 83, 72, 73, 0, 0, 0 }, { 83, 72, 79, 85, 0, 0 }, { 83, 72, 85, 0, 0, 0 },
197             { 83, 72, 85, 65, 0, 0 }, { 83, 72, 85, 65, 73, 0 }, { 83, 72, 85, 65, 78, 0 },
198             { 83, 72, 85, 65, 78, 71 }, { 83, 72, 85, 73, 0, 0 }, { 83, 72, 85, 78, 0, 0 },
199             { 83, 72, 85, 79, 0, 0 }, { 83, 73, 0, 0, 0, 0 }, { 83, 79, 78, 71, 0, 0 },
200             { 83, 79, 85, 0, 0, 0 }, { 83, 85, 0, 0, 0, 0 }, { 83, 85, 65, 78, 0, 0 },
201             { 83, 85, 73, 0, 0, 0 }, { 83, 85, 78, 0, 0, 0 }, { 83, 85, 79, 0, 0, 0 },
202             { 84, 65, 0, 0, 0, 0 }, { 84, 65, 73, 0, 0, 0 }, { 84, 65, 78, 0, 0, 0 },
203             { 84, 65, 78, 71, 0, 0 }, { 84, 65, 79, 0, 0, 0 }, { 84, 69, 0, 0, 0, 0 },
204             { 84, 69, 78, 71, 0, 0 }, { 84, 73, 0, 0, 0, 0 }, { 84, 73, 65, 78, 0, 0 },
205             { 84, 73, 65, 79, 0, 0 }, { 84, 73, 69, 0, 0, 0 }, { 84, 73, 78, 71, 0, 0 },
206             { 84, 79, 78, 71, 0, 0 }, { 84, 79, 85, 0, 0, 0 }, { 84, 85, 0, 0, 0, 0 },
207             { 84, 85, 65, 78, 0, 0 }, { 84, 85, 73, 0, 0, 0 }, { 84, 85, 78, 0, 0, 0 },
208             { 84, 85, 79, 0, 0, 0 }, { 87, 65, 0, 0, 0, 0 }, { 87, 65, 73, 0, 0, 0 },
209             { 87, 65, 78, 0, 0, 0 }, { 87, 65, 78, 71, 0, 0 }, { 87, 69, 73, 0, 0, 0 },
210             { 87, 69, 78, 0, 0, 0 }, { 87, 69, 78, 71, 0, 0 }, { 87, 79, 0, 0, 0, 0 },
211             { 87, 85, 0, 0, 0, 0 }, { 88, 73, 0, 0, 0, 0 }, { 88, 73, 65, 0, 0, 0 },
212             { 88, 73, 65, 78, 0, 0 }, { 88, 73, 65, 78, 71, 0 }, { 88, 73, 65, 79, 0, 0 },
213             { 88, 73, 69, 0, 0, 0 }, { 88, 73, 78, 0, 0, 0 }, { 88, 73, 78, 71, 0, 0 },
214             { 88, 73, 79, 78, 71, 0 }, { 88, 73, 85, 0, 0, 0 }, { 88, 85, 0, 0, 0, 0 },
215             { 88, 85, 65, 78, 0, 0 }, { 88, 85, 69, 0, 0, 0 }, { 88, 85, 78, 0, 0, 0 },
216             { 89, 65, 0, 0, 0, 0 }, { 89, 65, 78, 0, 0, 0 }, { 89, 65, 78, 71, 0, 0 },
217             { 89, 65, 79, 0, 0, 0 }, { 89, 69, 0, 0, 0, 0 }, { 89, 73, 0, 0, 0, 0 },
218             { 89, 73, 78, 0, 0, 0 }, { 89, 73, 78, 71, 0, 0 }, { 89, 79, 0, 0, 0, 0 },
219             { 89, 79, 78, 71, 0, 0 }, { 89, 79, 85, 0, 0, 0 }, { 89, 85, 0, 0, 0, 0 },
220             { 89, 85, 65, 78, 0, 0 }, { 89, 85, 69, 0, 0, 0 }, { 89, 85, 78, 0, 0, 0 },
221             { 90, 65, 0, 0, 0, 0 }, { 90, 65, 73, 0, 0, 0 }, { 90, 65, 78, 0, 0, 0 },
222             { 90, 65, 78, 71, 0, 0 }, { 90, 65, 79, 0, 0, 0 }, { 90, 69, 0, 0, 0, 0 },
223             { 90, 69, 73, 0, 0, 0 }, { 90, 69, 78, 0, 0, 0 }, { 90, 69, 78, 71, 0, 0 },
224             { 90, 72, 65, 0, 0, 0 }, { 90, 72, 65, 73, 0, 0 }, { 90, 72, 65, 78, 0, 0 },
225             { 90, 72, 65, 78, 71, 0 }, { 90, 72, 65, 79, 0, 0 }, { 90, 72, 69, 0, 0, 0 },
226             { 90, 72, 69, 78, 0, 0 }, { 90, 72, 69, 78, 71, 0 }, { 90, 72, 73, 0, 0, 0 },
227             { 90, 72, 79, 78, 71, 0 }, { 90, 72, 79, 85, 0, 0 }, { 90, 72, 85, 0, 0, 0 },
228             { 90, 72, 85, 65, 0, 0 }, { 90, 72, 85, 65, 73, 0 }, { 90, 72, 85, 65, 78, 0 },
229             { 90, 72, 85, 65, 78, 71 }, { 90, 72, 85, 73, 0, 0 }, { 90, 72, 85, 78, 0, 0 },
230             { 90, 72, 85, 79, 0, 0 }, { 90, 73, 0, 0, 0, 0 }, { 90, 79, 78, 71, 0, 0 },
231             { 90, 79, 85, 0, 0, 0 }, { 90, 85, 0, 0, 0, 0 }, { 90, 85, 65, 78, 0, 0 },
232             { 90, 85, 73, 0, 0, 0 }, { 90, 85, 78, 0, 0, 0 }, { 90, 85, 79, 0, 0, 0 }, };
233     /** First and last Chinese character with known Pinyin according to zh collation */
        // getToken() returns an UNKNOWN token for anything that collates outside this range.
234     private static final String FIRST_PINYIN_UNIHAN = "\u963F";
235     private static final String LAST_PINYIN_UNIHAN = "\u84D9";
236     /** The first Chinese character in Unicode block */
        // Characters below this code point are never passed to the collator.
237     private static final char FIRST_UNIHAN = '\u3400';
        // Collator for Locale.CHINA; used for every comparison in getToken() and doSelfValidation().
238     private static final Collator COLLATOR = Collator.getInstance(Locale.CHINA);
        // Singleton instance; created and read only inside getInstance() under the class lock.
239     private static HanziToPinyin sInstance;
        // True when getInstance() found Locale.CHINA among Collator.getAvailableLocales();
        // when false, get() returns an empty list.
240     private final boolean mHasChinaCollator;
241     public static class Token {
242         /**
243          * Separator between target string for each source char
244          */
245         public static final String SEPARATOR = " ";
246         public static final int LATIN = 1;
247         public static final int PINYIN = 2;
248         public static final int UNKNOWN = 3;
249         public Token() {
250         }
251         public Token(int type, String source, String target) {
252             this.type = type;
253             this.source = source;
254             this.target = target;
255         }
256         /**
257          * Type of this token, ASCII, PINYIN or UNKNOWN.
258          */
259         public int type;
260         /**
261          * Original string before translation.
262          */
263         public String source;
264         /**
265          * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
266          * original string in source.
267          */
268         public String target;
269     }
270     protected HanziToPinyin(boolean hasChinaCollator) {
271         mHasChinaCollator = hasChinaCollator;
272     }
273     public static HanziToPinyin getInstance() {
274         synchronized (HanziToPinyin.class) {
275             if (sInstance != null) {
276                 return sInstance;
277             }
278             // Check if zh_CN collation data is available
279             final Locale locale[] = Collator.getAvailableLocales();
280             for (int i = 0; i < locale.length; i++) {
281                 if (locale[i].equals(Locale.CHINA)) {
282                     // Do self validation just once.
283                     if (DEBUG) {
284                         Log.d(TAG, "Self validation. Result: " + doSelfValidation());
285                     }
286                     sInstance = new HanziToPinyin(true);
287                     return sInstance;
288                 }
289             }
290             Log.w(TAG, "There is no Chinese collator, HanziToPinyin is disabled");
291             sInstance = new HanziToPinyin(false);
292             return sInstance;
293         }
294     }
295     /**
296      * Validate if our internal table has some wrong value.
297      *
298      * @return true when the table looks correct.
299      */
300     private static boolean doSelfValidation() {
301         char lastChar = UNIHANS[0];
302         String lastString = Character.toString(lastChar);
303         for (char c : UNIHANS) {
304             if (lastChar == c) {
305                 continue;
306             }
307             final String curString = Character.toString(c);
308             int cmp = COLLATOR.compare(lastString, curString);
309             if (cmp >= 0) {
310                 Log.e(TAG, "Internal error in Unihan table. " + "The last string \"" + lastString
311                         + "\" is greater than current string \"" + curString + "\".");
312                 return false;
313             }
314             lastString = curString;
315         }
316         return true;
317     }
318     private Token getToken(char character) {
319         Token token = new Token();
320         final String letter = Character.toString(character);
321         token.source = letter;
322         int offset = -1;
323         int cmp;
324         if (character < 256) {
325             token.type = Token.LATIN;
326             token.target = letter;
327             return token;
328         } else if (character < FIRST_UNIHAN) {
329             token.type = Token.UNKNOWN;
330             token.target = letter;
331             return token;
332         } else {
333             cmp = COLLATOR.compare(letter, FIRST_PINYIN_UNIHAN);
334             if (cmp < 0) {
335                 token.type = Token.UNKNOWN;
336                 token.target = letter;
337                 return token;
338             } else if (cmp == 0) {
339                 token.type = Token.PINYIN;
340                 offset = 0;
341             } else {
342                 cmp = COLLATOR.compare(letter, LAST_PINYIN_UNIHAN);
343                 if (cmp > 0) {
344                     token.type = Token.UNKNOWN;
345                     token.target = letter;
346                     return token;
347                 } else if (cmp == 0) {
348                     token.type = Token.PINYIN;
349                     offset = UNIHANS.length - 1;
350                 }
351             }
352         }
353         token.type = Token.PINYIN;
354         if (offset < 0) {
355             int begin = 0;
356             int end = UNIHANS.length - 1;
357             while (begin <= end) {
358                 offset = (begin + end) / 2;
359                 final String unihan = Character.toString(UNIHANS[offset]);
360                 cmp = COLLATOR.compare(letter, unihan);
361                 if (cmp == 0) {
362                     break;
363                 } else if (cmp > 0) {
364                     begin = offset + 1;
365                 } else {
366                     end = offset - 1;
367                 }
368             }
369         }
370         if (cmp < 0) {
371             offset--;
372         }
373         StringBuilder pinyin = new StringBuilder();
374         for (int j = 0; j < PINYINS[offset].length && PINYINS[offset][j] != 0; j++) {
375             pinyin.append((char) PINYINS[offset][j]);
376         }
377         token.target = pinyin.toString();
378         return token;
379     }
380     /**
381      * Convert the input to a array of tokens. The sequence of ASCII or Unknown characters without
382      * space will be put into a Token, One Hanzi character which has pinyin will be treated as a
383      * Token. If these is no China collator, the empty token array is returned.
384      */
385     public ArrayList<Token> get(final String input) {
386         ArrayList<Token> tokens = new ArrayList<Token>();
387         if (!mHasChinaCollator || TextUtils.isEmpty(input)) {
388             // return empty tokens.
389             return tokens;
390         }
391         final int inputLength = input.length();
392         final StringBuilder sb = new StringBuilder();
393         int tokenType = Token.LATIN;
394         // Go through the input, create a new token when
395         // a. Token type changed
396         // b. Get the Pinyin of current charater.
397         // c. current character is space.
398         for (int i = 0; i < inputLength; i++) {
399             final char character = input.charAt(i);
400             if (character == ' ') {
401                 if (sb.length() > 0) {
402                     addToken(sb, tokens, tokenType);
403                 }
404             } else if (character < 256) {
405                 if (tokenType != Token.LATIN && sb.length() > 0) {
406                     addToken(sb, tokens, tokenType);
407                 }
408                 tokenType = Token.LATIN;
409                 sb.append(character);
410             } else if (character < FIRST_UNIHAN) {
411                 if (tokenType != Token.UNKNOWN && sb.length() > 0) {
412                     addToken(sb, tokens, tokenType);
413                 }
414                 tokenType = Token.UNKNOWN;
415                 sb.append(character);
416             } else {
417                 Token t = getToken(character);
418                 if (t.type == Token.PINYIN) {
419                     if (sb.length() > 0) {
420                         addToken(sb, tokens, tokenType);
421                     }
422                     tokens.add(t);
423                     tokenType = Token.PINYIN;
424                 } else {
425                     if (tokenType != t.type && sb.length() > 0) {
426                         addToken(sb, tokens, tokenType);
427                     }
428                     tokenType = t.type;
429                     sb.append(character);
430                 }
431             }
432         }
433         if (sb.length() > 0) {
434             addToken(sb, tokens, tokenType);
435         }
436         return tokens;
437     }
438     private void addToken(
439             final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
440         String str = sb.toString();
441         tokens.add(new Token(tokenType, str, str));
442         sb.setLength(0);
443     }
444    
445     // The following lines are provided and maintained by MediaTek Inc.
446     private class DialerSearchToken extends Token {
447      static final int FIRSTCASE = 0;
448      static final int UPPERCASE = 1;
449      static final int LOWERCASE = 2;
450     }
451     
452     public String getTokensForDialerSearch(final String input, StringBuilder offsets){
453         
454         if(offsets == null || input == null || TextUtils.isEmpty(input)){
455          // return empty tokens
456          return null;
457         }
458         
459      StringBuilder subStrSet = new StringBuilder();
460         ArrayList<Token> tokens = new ArrayList<Token>();
461         ArrayList<String> shortSubStrOffset = new ArrayList<String>();
462         final int inputLength = input.length();
463         final StringBuilder subString = new StringBuilder();
464         final StringBuilder subStrOffset = new StringBuilder();
465         int tokenType = Token.LATIN;
466         int caseTypePre = DialerSearchToken.FIRSTCASE;
467         int caseTypeCurr = DialerSearchToken.UPPERCASE;
468         int mPos = 0;
469         
470         // Go through the input, create a new token when
471         // a. Token type changed
472         // b. Get the Pinyin of current charater.
473         // c. current character is space.
474         // d. Token case changed from lower case to upper case, 
475         // e. the first character is always a separated one
476         // f character == '+' || character == '#' || character == '*' || character == ',' || character == ';'
477         for (int i = 0; i < inputLength; i++) {
478             final char character = input.charAt(i);
479             if (character == '-' || character == ',' ){
480              mPos++;
481             } else if (character == ' ') {
482                 if (subString.length() > 0) {
483                     addToken(subString, tokens, tokenType);
484                     addOffsets(subStrOffset, shortSubStrOffset);
485                 }
486                 addSubString(tokens,shortSubStrOffset,subStrSet,offsets);
487              mPos++;
488                 caseTypePre = DialerSearchToken.FIRSTCASE;
489             } else if (character < 256) {
490                 if (tokenType != Token.LATIN && subString.length() > 0) {
491                     addToken(subString, tokens, tokenType);
492                     addOffsets(subStrOffset, shortSubStrOffset);
493                  }
494                 caseTypeCurr = (character>='A' && character<='Z')?DialerSearchToken.UPPERCASE:DialerSearchToken.LOWERCASE;
495                 if(caseTypePre == DialerSearchToken.LOWERCASE && caseTypeCurr == DialerSearchToken.UPPERCASE){
496                  addToken(subString, tokens, tokenType);
497                  addOffsets(subStrOffset, shortSubStrOffset);
498                 }
499                 caseTypePre = caseTypeCurr; 
500                 tokenType = Token.LATIN;
501                 Character c = Character.toUpperCase(character);
502                 if(c != null){
503                  subString.append(c);
504                  subStrOffset.append((char)mPos);
505                 }
506                 mPos++;
507             } else if (character < FIRST_UNIHAN) {
508                   //Comment out. Do not cover unknown characters SINCE they can not be input.
509 //                if (tokenType != Token.UNKNOWN && subString.length() > 0) {
510 //                    addToken(subString, tokens, tokenType);
511 //                    addOffsets(subStrOffset, shortSubStrOffset);
512 //                    caseTypePre = Token.FIRSTCASE;
513 //                }
514 //                tokenType = Token.UNKNOWN;
515 //                Character c = Character.toUpperCase(character);
516 //                if(c != null){
517 //                 subString.append(c);
518 //                 subStrOffset.append((char)(mPos));
519 //                }
520                 mPos++;
521             } else {
522              Token t = getToken(character);
523                 int tokenSize = t.target.length();
524                 //Current type is PINYIN
525                 if (t.type == Token.PINYIN) {
526                     if (subString.length() > 0) {
527                         addToken(subString, tokens, tokenType);
528                         addOffsets(subStrOffset, shortSubStrOffset);
529                     }
530                     tokens.add(t);
531                     for(int j=0; j < tokenSize;j++)
532                      subStrOffset.append((char)mPos);
533                     addOffsets(subStrOffset,shortSubStrOffset);
534                     tokenType = Token.PINYIN;
535                     caseTypePre = DialerSearchToken.FIRSTCASE;
536                     mPos++;
537                 } else {
538                  //Comment out. Do not cover special characters SINCE they can not be input.
539 //                    if (tokenType != t.type && subString.length() > 0) {
540 //                        addToken(subString, tokens, tokenType);
541 //                        addOffsets(subStrOffset, shortSubStrOffset);
542 //                        caseTypePre = Token.FIRSTCASE;
543 //                    }else{
544 //                     caseTypeCurr = (character>='A' && character<='Z')?Token.UPPERCASE:Token.LOWERCASE;
545 //                     if(caseTypePre == Token.LOWERCASE && caseTypeCurr == Token.UPPERCASE){
546 //                      addToken(subString, tokens, tokenType);
547 //                      addOffsets(subStrOffset, shortSubStrOffset);
548 //                     }
549 //                     caseTypePre = caseTypeCurr; 
550 //                    }
551 //                    tokenType = t.type;
552 //                    Character c = Character.toUpperCase(character);
553 //                    if(c != null){
554 //                     subString.append(c);
555 //                     subStrOffset.append(mPos);
556 //                    }
557                     mPos++;
558                 }
559             }
560             //IF the name string is too long, cut it off to meet the storage request of dialer search.
561             if(mPos > 127)
562              break;
563         }
564         if (subString.length() > 0) {
565             addToken(subString, tokens, tokenType);
566             addOffsets(subStrOffset, shortSubStrOffset);
567         }
568         addSubString(tokens,shortSubStrOffset,subStrSet,offsets);
569         return subStrSet.toString();
570     }
571     
572     private void addOffsets(final StringBuilder sb, final ArrayList<String> shortSubStrOffset){
573      String str = sb.toString();
574      shortSubStrOffset.add(str);
575      sb.setLength(0);
576     }
577     
578     private void addSubString(final ArrayList<Token> tokens, final ArrayList<String> shortSubStrOffset,
579           StringBuilder subStrSet, StringBuilder offsets){
580      if(tokens == null || tokens.isEmpty())
581       return;
582      
583      int size = tokens.size();
584      int len = 0;
585      StringBuilder mShortSubStr = new StringBuilder();
586      StringBuilder mShortSubStrOffsets = new StringBuilder();
587      StringBuilder mShortSubStrSet = new StringBuilder();
588      StringBuilder mShortSubStrOffsetsSet = new StringBuilder();
589      
590      for(int i=size-1; i>=0 ; i--){
591       String mTempStr = tokens.get(i).target;
592       len += mTempStr.length();
593       String mTempOffset = shortSubStrOffset.get(i);
594       if(mShortSubStr.length()>0){
595        mShortSubStr.deleteCharAt(0);
596        mShortSubStrOffsets.deleteCharAt(0);
597       }
598       mShortSubStr.insert(0, mTempStr);
599       mShortSubStr.insert(0,(char)len);
600       mShortSubStrOffsets.insert(0,mTempOffset);
601       mShortSubStrOffsets.insert(0,(char)len);
602       mShortSubStrSet.insert(0,mShortSubStr);
603       mShortSubStrOffsetsSet.insert(0, mShortSubStrOffsets);
604      }
605      
606      subStrSet.append(mShortSubStrSet);
607      offsets.append(mShortSubStrOffsetsSet);
608      tokens.clear();
609      shortSubStrOffset.clear();
610     }
611     //The previous lines are provided and maintained by Mediatek inc.    
612 }

调用示例:参数 input 为待转换的汉字字符串,返回值为对应的大写拼音。

 /**
  * Converts the given text to an upper-case pinyin string.
  * <p>
  * The input is tokenized through {@code HanziToPinyin.getInstance().get(input)}. A token of type
  * {@code Token.PINYIN} contributes its {@code target}; any other token contributes its
  * {@code source} unchanged. The concatenation is then upper-cased.
  *
  * @param input the text to convert
  * @return the upper-cased concatenation; an empty string when no tokens are produced
  */
 public static String getPinYin2(String input) {
     ArrayList<Token> tokens = HanziToPinyin.getInstance().get(input);
     StringBuilder sb = new StringBuilder();
     if (tokens != null) {
         for (Token token : tokens) {
             if (Token.PINYIN == token.type) {
                 sb.append(token.target);
             } else {
                 sb.append(token.source);
             }
         }
     }
     // Upper-case with a fixed locale: the no-arg overload follows the device locale, which
     // maps 'i' to dotted 'İ' under Turkish/Azeri settings and would corrupt the pinyin.
     return sb.toString().toUpperCase(java.util.Locale.ROOT);
 }

 

posted on 2013-02-19 10:46  _star  阅读(180)  评论(0)    收藏  举报

导航