GBK2ASC
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thunder.gbktoasc;

import android.text.TextUtils;
import android.util.Log;

import java.text.Collator;
import java.util.ArrayList;
import java.util.Locale;

/**
 * An object to convert Chinese character to its corresponding pinyin string. For characters with
 * multiple possible pinyin string, only one is selected according to collator. Polyphone is not
 * supported in this implementation. This class is implemented to achieve the best runtime
 * performance and minimum runtime resources with tolerable sacrifice of accuracy. This
 * implementation highly depends on zh_CN ICU collation data and must be always synchronized with
 * ICU.
 *
 * Currently this file is aligned to zh.txt in ICU 4.6
 */
public class HanziToPinyin {
    private static final String TAG = "HanziToPinyin";

    // Turn on this flag when we want to check internal data structure.
    private static final boolean DEBUG = false;

    /**
     * Unihans array. Each unihans is the first one within same pinyin. Use it to determine pinyin
     * for all ~20k unihans.
     */
    public static final char[] UNIHANS = {
            '\u5475', '\u54ce', '\u5b89', '\u80ae', '\u51f9',
            '\u516b', '\u6300', '\u6273', '\u90a6', '\u5305', '\u5351', '\u5954', '\u4f3b',
            '\u5c44', '\u8fb9', '\u6807', '\u618b', '\u90a0', '\u69df', '\u7676', '\u5cec',
            '\u5693', '\u5a47', '\u98e1', '\u4ed3', '\u64cd', '\u518a', '\u5d7e', '\u564c',
            '\u53c9', '\u9497', '\u8fbf', '\u4f25', '\u6284', '\u8f66', '\u62bb', '\u67fd',
            '\u5403', '\u5145', '\u62bd', '\u51fa', '\u6b3b', '\u63e3', '\u5ddd', '\u75ae',
            '\u5439', '\u6776', '\u9034', '\u75b5', '\u5306', '\u51d1', '\u7c97', '\u6c46',
            '\u5d14', '\u90a8', '\u6413', '\u5491', '\u5927', '\u75b8', '\u5f53', '\u5200',
            '\u6dc2', '\u5f97', '\u6265', '\u706f', '\u6c10', '\u55f2', '\u7538', '\u5201',
            '\u7239', '\u4ec3', '\u4e1f', '\u4e1c', '\u5517', '\u561f', '\u5073', '\u5806',
            '\u9413', '\u591a', '\u5a40', '\u8bf6', '\u5940', '\u97a5', '\u800c', '\u53d1',
            '\u5e06', '\u65b9', '\u98de', '\u5206', '\u4e30', '\u8985', '\u4ecf', '\u7d11',
            '\u4f15', '\u65ee', '\u8be5', '\u7518', '\u5188', '\u768b', '\u6208', '\u7d66',
            '\u6839', '\u5e9a', '\u5de5', '\u52fe', '\u4f30', '\u74dc', '\u7f6b', '\u5173',
            '\u5149', '\u5f52', '\u886e', '\u5459', '\u54c8', '\u54b3', '\u9878', '\u82c0',
            '\u84bf', '\u8bc3', '\u9ed2', '\u62eb', '\u4ea8', '\u5677', '\u543d', '\u9f41',
            '\u5322', '\u82b1', '\u6000', '\u72bf', '\u5ddf', '\u7070', '\u660f', '\u5419',
            '\u4e0c', '\u52a0', '\u620b', '\u6c5f', '\u827d', '\u9636', '\u5dfe', '\u52a4',
            '\u5182', '\u52fc', '\u530a', '\u5a1f', '\u5658', '\u519b', '\u5494', '\u5f00',
            '\u520a', '\u95f6', '\u5c3b', '\u533c', '\u524b', '\u80af', '\u962c', '\u7a7a',
            '\u62a0', '\u5233', '\u5938', '\u84af', '\u5bbd', '\u5321', '\u4e8f', '\u5764',
            '\u6269', '\u5783', '\u6765', '\u5170', '\u5577', '\u635e', '\u4ec2', '\u52d2',
            '\u5844', '\u5215', '\u5006', '\u5941', '\u826f', '\u64a9', '\u5217', '\u62ce',
            '\u3007', '\u6e9c', '\u9f99', '\u779c', '\u565c', '\u5a08', '\u7567', '\u62a1',
            '\u7f57', '\u5463', '\u5988', '\u973e', '\u5ada', '\u9099', '\u732b', '\u9ebc',
            '\u6c92', '\u95e8', '\u753f', '\u54aa', '\u7720', '\u55b5', '\u54a9', '\u6c11',
            '\u540d', '\u8c2c', '\u6478', '\u54de', '\u6bea', '\u62cf', '\u5b7b', '\u56e1',
            '\u56ca', '\u5b6c', '\u8bb7', '\u9981', '\u6041', '\u80fd', '\u59ae', '\u62c8',
            '\u5b22', '\u9e1f', '\u634f', '\u60a8', '\u5b81', '\u599e', '\u519c', '\u7fba',
            '\u5974', '\u597b', '\u8650', '\u632a', '\u5594', '\u8bb4', '\u8db4', '\u62cd',
            '\u7705', '\u4e53', '\u629b', '\u5478', '\u55b7', '\u5309', '\u4e15', '\u504f',
            '\u527d', '\u6c15', '\u59d8', '\u4e52', '\u948b', '\u5256', '\u4ec6', '\u4e03',
            '\u6390', '\u5343', '\u545b', '\u6084', '\u767f', '\u4fb5', '\u9751', '\u909b',
            '\u4e18', '\u66f2', '\u5f2e', '\u7f3a', '\u590b', '\u5465', '\u7a63', '\u5a06',
            '\u60f9', '\u4eba', '\u6254', '\u65e5', '\u8338', '\u53b9', '\u5982', '\u5827',
            '\u6875', '\u95f0', '\u82e5', '\u4ee8', '\u6be2', '\u4e09', '\u6852', '\u63bb',
            '\u8272', '\u68ee', '\u50e7', '\u6740', '\u7b5b', '\u5c71', '\u4f24', '\u5f30',
            '\u5962', '\u7533', '\u5347', '\u5c38', '\u53ce', '\u4e66', '\u5237', '\u6454',
            '\u95e9', '\u53cc', '\u8c01', '\u542e', '\u5981', '\u53b6', '\u5fea', '\u635c',
            '\u82cf', '\u72fb', '\u590a', '\u5b59', '\u5506', '\u4ed6', '\u82d4', '\u574d',
            '\u94f4', '\u5932', '\u5fd1', '\u71a5', '\u5254', '\u5929', '\u4f7b', '\u5e16',
            '\u5385', '\u56f2', '\u5077', '\u92c0', '\u6e4d', '\u63a8', '\u541e', '\u6258',
            '\u6316', '\u6b6a', '\u5f2f', '\u5c2a', '\u5371', '\u586d', '\u7fc1', '\u631d',
            '\u5140', '\u5915', '\u867e', '\u4eda', '\u4e61', '\u7071', '\u4e9b', '\u5fc3',
            '\u661f', '\u51f6', '\u4f11', '\u65f4', '\u8f69', '\u75b6', '\u52cb', '\u4e2b',
            '\u6079', '\u592e', '\u5e7a', '\u8036', '\u4e00', '\u6b2d', '\u5e94', '\u54df',
            '\u4f63', '\u4f18', '\u625c', '\u9e22', '\u66f0', '\u6655', '\u531d', '\u707d',
            '\u7ccc', '\u7242', '\u50ae', '\u5219', '\u8d3c', '\u600e', '\u5897', '\u5412',
            '\u635a', '\u6cbe', '\u5f20', '\u948a', '\u8707', '\u8d1e', '\u4e89', '\u4e4b',
            '\u4e2d', '\u5dde', '\u6731', '\u6293', '\u8de9', '\u4e13', '\u5986', '\u96b9',
            '\u5b92', '\u5353', '\u5b5c', '\u5b97', '\u90b9', '\u79df', '\u94bb', '\u539c',
            '\u5c0a', '\u6628', };

    /**
     * Pinyin array. Each pinyin is corresponding to unihans of same offset in the unihans array.
     * Every row holds the upper-case ASCII codes of one pinyin syllable, zero-padded to 6 bytes.
     */
    public static final byte[][] PINYINS = {
            { 65, 0, 0, 0, 0, 0 }, { 65, 73, 0, 0, 0, 0 }, { 65, 78, 0, 0, 0, 0 },
            { 65, 78, 71, 0, 0, 0 }, { 65, 79, 0, 0, 0, 0 }, { 66, 65, 0, 0, 0, 0 },
            { 66, 65, 73, 0, 0, 0 }, { 66, 65, 78, 0, 0, 0 }, { 66, 65, 78, 71, 0, 0 },
            { 66, 65, 79, 0, 0, 0 }, { 66, 69, 73, 0, 0, 0 }, { 66, 69, 78, 0, 0, 0 },
            { 66, 69, 78, 71, 0, 0 }, { 66, 73, 0, 0, 0, 0 }, { 66, 73, 65, 78, 0, 0 },
            { 66, 73, 65, 79, 0, 0 }, { 66, 73, 69, 0, 0, 0 }, { 66, 73, 78, 0, 0, 0 },
            { 66, 73, 78, 71, 0, 0 }, { 66, 79, 0, 0, 0, 0 }, { 66, 85, 0, 0, 0, 0 },
            { 67, 65, 0, 0, 0, 0 }, { 67, 65, 73, 0, 0, 0 },
            { 67, 65, 78, 0, 0, 0 }, { 67, 65, 78, 71, 0, 0 }, { 67, 65, 79, 0, 0, 0 },
            { 67, 69, 0, 0, 0, 0 }, { 67, 69, 78, 0, 0, 0 }, { 67, 69, 78, 71, 0, 0 },
            { 67, 72, 65, 0, 0, 0 }, { 67, 72, 65, 73, 0, 0 }, { 67, 72, 65, 78, 0, 0 },
            { 67, 72, 65, 78, 71, 0 }, { 67, 72, 65, 79, 0, 0 }, { 67, 72, 69, 0, 0, 0 },
            { 67, 72, 69, 78, 0, 0 }, { 67, 72, 69, 78, 71, 0 }, { 67, 72, 73, 0, 0, 0 },
            { 67, 72, 79, 78, 71, 0 }, { 67, 72, 79, 85, 0, 0 }, { 67, 72, 85, 0, 0, 0 },
            { 67, 72, 85, 65, 0, 0 }, { 67, 72, 85, 65, 73, 0 }, { 67, 72, 85, 65, 78, 0 },
            { 67, 72, 85, 65, 78, 71 }, { 67, 72, 85, 73, 0, 0 }, { 67, 72, 85, 78, 0, 0 },
            { 67, 72, 85, 79, 0, 0 }, { 67, 73, 0, 0, 0, 0 }, { 67, 79, 78, 71, 0, 0 },
            { 67, 79, 85, 0, 0, 0 }, { 67, 85, 0, 0, 0, 0 }, { 67, 85, 65, 78, 0, 0 },
            { 67, 85, 73, 0, 0, 0 }, { 67, 85, 78, 0, 0, 0 }, { 67, 85, 79, 0, 0, 0 },
            { 68, 65, 0, 0, 0, 0 }, { 68, 65, 73, 0, 0, 0 }, { 68, 65, 78, 0, 0, 0 },
            { 68, 65, 78, 71, 0, 0 }, { 68, 65, 79, 0, 0, 0 }, { 68, 69, 0, 0, 0, 0 },
            { 68, 69, 73, 0, 0, 0 }, { 68, 69, 78, 0, 0, 0 }, { 68, 69, 78, 71, 0, 0 },
            { 68, 73, 0, 0, 0, 0 }, { 68, 73, 65, 0, 0, 0 }, { 68, 73, 65, 78, 0, 0 },
            { 68, 73, 65, 79, 0, 0 }, { 68, 73, 69, 0, 0, 0 }, { 68, 73, 78, 71, 0, 0 },
            { 68, 73, 85, 0, 0, 0 }, { 68, 79, 78, 71, 0, 0 }, { 68, 79, 85, 0, 0, 0 },
            { 68, 85, 0, 0, 0, 0 }, { 68, 85, 65, 78, 0, 0 }, { 68, 85, 73, 0, 0, 0 },
            { 68, 85, 78, 0, 0, 0 }, { 68, 85, 79, 0, 0, 0 }, { 69, 0, 0, 0, 0, 0 },
            { 69, 73, 0, 0, 0, 0 }, { 69, 78, 0, 0, 0, 0 }, { 69, 78, 71, 0, 0, 0 },
            { 69, 82, 0, 0, 0, 0 }, { 70, 65, 0, 0, 0, 0 }, { 70, 65, 78, 0, 0, 0 },
            { 70, 65, 78, 71, 0, 0 }, { 70, 69, 73, 0, 0, 0 }, { 70, 69, 78, 0, 0, 0 },
            { 70, 69, 78, 71, 0, 0 }, { 70, 73, 65, 79, 0, 0 }, { 70, 79, 0, 0, 0, 0 },
            { 70, 79, 85, 0, 0, 0 }, { 70, 85, 0, 0, 0, 0 }, { 71, 65, 0, 0, 0, 0 },
            { 71, 65, 73, 0, 0, 0 }, { 71, 65, 78, 0, 0, 0 }, { 71, 65, 78, 71, 0, 0 },
            { 71, 65, 79, 0, 0, 0 }, { 71, 69, 0, 0, 0, 0 }, { 71, 69, 73, 0, 0, 0 },
            { 71, 69, 78, 0, 0, 0 }, { 71, 69, 78, 71, 0, 0 }, { 71, 79, 78, 71, 0, 0 },
            { 71, 79, 85, 0, 0, 0 }, { 71, 85, 0, 0, 0, 0 }, { 71, 85, 65, 0, 0, 0 },
            { 71, 85, 65, 73, 0, 0 }, { 71, 85, 65, 78, 0, 0 }, { 71, 85, 65, 78, 71, 0 },
            { 71, 85, 73, 0, 0, 0 }, { 71, 85, 78, 0, 0, 0 }, { 71, 85, 79, 0, 0, 0 },
            { 72, 65, 0, 0, 0, 0 }, { 72, 65, 73, 0, 0, 0 }, { 72, 65, 78, 0, 0, 0 },
            { 72, 65, 78, 71, 0, 0 }, { 72, 65, 79, 0, 0, 0 }, { 72, 69, 0, 0, 0, 0 },
            { 72, 69, 73, 0, 0, 0 }, { 72, 69, 78, 0, 0, 0 }, { 72, 69, 78, 71, 0, 0 },
            { 72, 77, 0, 0, 0, 0 }, { 72, 79, 78, 71, 0, 0 }, { 72, 79, 85, 0, 0, 0 },
            { 72, 85, 0, 0, 0, 0 }, { 72, 85, 65, 0, 0, 0 }, { 72, 85, 65, 73, 0, 0 },
            { 72, 85, 65, 78, 0, 0 }, { 72, 85, 65, 78, 71, 0 }, { 72, 85, 73, 0, 0, 0 },
            { 72, 85, 78, 0, 0, 0 }, { 72, 85, 79, 0, 0, 0 }, { 74, 73, 0, 0, 0, 0 },
            { 74, 73, 65, 0, 0, 0 }, { 74, 73, 65, 78, 0, 0 }, { 74, 73, 65, 78, 71, 0 },
            { 74, 73, 65, 79, 0, 0 }, { 74, 73, 69, 0, 0, 0 }, { 74, 73, 78, 0, 0, 0 },
            { 74, 73, 78, 71, 0, 0 }, { 74, 73, 79, 78, 71, 0 }, { 74, 73, 85, 0, 0, 0 },
            { 74, 85, 0, 0, 0, 0 }, { 74, 85, 65, 78, 0, 0 }, { 74, 85, 69, 0, 0, 0 },
            { 74, 85, 78, 0, 0, 0 }, { 75, 65, 0, 0, 0, 0 }, { 75, 65, 73, 0, 0, 0 },
            { 75, 65, 78, 0, 0, 0 }, { 75, 65, 78, 71, 0, 0 }, { 75, 65, 79, 0, 0, 0 },
            { 75, 69, 0, 0, 0, 0 }, { 75, 69, 73, 0, 0, 0 }, { 75, 69, 78, 0, 0, 0 },
            { 75, 69, 78, 71, 0, 0 }, { 75, 79, 78, 71, 0, 0 }, { 75, 79, 85, 0, 0, 0 },
            { 75, 85, 0, 0, 0, 0 }, { 75, 85, 65, 0, 0, 0 }, { 75, 85, 65, 73, 0, 0 },
            { 75, 85, 65, 78, 0, 0 }, { 75, 85, 65, 78, 71, 0 }, { 75, 85, 73, 0, 0, 0 },
            { 75, 85, 78, 0, 0, 0 }, { 75, 85, 79, 0, 0, 0 }, { 76, 65, 0, 0, 0, 0 },
            { 76, 65, 73, 0, 0, 0 }, { 76, 65, 78, 0, 0, 0 }, { 76, 65, 78, 71, 0, 0 },
            { 76, 65, 79, 0, 0, 0 }, { 76, 69, 0, 0, 0, 0 }, { 76, 69, 73, 0, 0, 0 },
            { 76, 69, 78, 71, 0, 0 }, { 76, 73, 0, 0, 0, 0 }, { 76, 73, 65, 0, 0, 0 },
            { 76, 73, 65, 78, 0, 0 }, { 76, 73, 65, 78, 71, 0 }, { 76, 73, 65, 79, 0, 0 },
            { 76, 73, 69, 0, 0, 0 }, { 76, 73, 78, 0, 0, 0 }, { 76, 73, 78, 71, 0, 0 },
            { 76, 73, 85, 0, 0, 0 }, { 76, 79, 78, 71, 0, 0 }, { 76, 79, 85, 0, 0, 0 },
            { 76, 85, 0, 0, 0, 0 }, { 76, 85, 65, 78, 0, 0 }, { 76, 85, 69, 0, 0, 0 },
            { 76, 85, 78, 0, 0, 0 }, { 76, 85, 79, 0, 0, 0 }, { 77, 0, 0, 0, 0, 0 },
            { 77, 65, 0, 0, 0, 0 }, { 77, 65, 73, 0, 0, 0 }, { 77, 65, 78, 0, 0, 0 },
            { 77, 65, 78, 71, 0, 0 }, { 77, 65, 79, 0, 0, 0 }, { 77, 69, 0, 0, 0, 0 },
            { 77, 69, 73, 0, 0, 0 }, { 77, 69, 78, 0, 0, 0 }, { 77, 69, 78, 71, 0, 0 },
            { 77, 73, 0, 0, 0, 0 }, { 77, 73, 65, 78, 0, 0 }, { 77, 73, 65, 79, 0, 0 },
            { 77, 73, 69, 0, 0, 0 }, { 77, 73, 78, 0, 0, 0 }, { 77, 73, 78, 71, 0, 0 },
            { 77, 73, 85, 0, 0, 0 }, { 77, 79, 0, 0, 0, 0 }, { 77, 79, 85, 0, 0, 0 },
            { 77, 85, 0, 0, 0, 0 }, { 78, 65, 0, 0, 0, 0 }, { 78, 65, 73, 0, 0, 0 },
            { 78, 65, 78, 0, 0, 0 }, { 78, 65, 78, 71, 0, 0 }, { 78, 65, 79, 0, 0, 0 },
            { 78, 69, 0, 0, 0, 0 }, { 78, 69, 73, 0, 0, 0 }, { 78, 69, 78, 0, 0, 0 },
            { 78, 69, 78, 71, 0, 0 }, { 78, 73, 0, 0, 0, 0 }, { 78, 73, 65, 78, 0, 0 },
            { 78, 73, 65, 78, 71, 0 }, { 78, 73, 65, 79, 0, 0 }, { 78, 73, 69, 0, 0, 0 },
            { 78, 73, 78, 0, 0, 0 }, { 78, 73, 78, 71, 0, 0 }, { 78, 73, 85, 0, 0, 0 },
            { 78, 79, 78, 71, 0, 0 }, { 78, 79, 85, 0, 0, 0 }, { 78, 85, 0, 0, 0, 0 },
            { 78, 85, 65, 78, 0, 0 }, { 78, 85, 69, 0, 0, 0 }, { 78, 85, 79, 0, 0, 0 },
            { 79, 0, 0, 0, 0, 0 }, { 79, 85, 0, 0, 0, 0 }, { 80, 65, 0, 0, 0, 0 },
            { 80, 65, 73, 0, 0, 0 }, { 80, 65, 78, 0, 0, 0 }, { 80, 65, 78, 71, 0, 0 },
            { 80, 65, 79, 0, 0, 0 }, { 80, 69, 73, 0, 0, 0 }, { 80, 69, 78, 0, 0, 0 },
            { 80, 69, 78, 71, 0, 0 }, { 80, 73, 0, 0, 0, 0 }, { 80, 73, 65, 78, 0, 0 },
            { 80, 73, 65, 79, 0, 0 }, { 80, 73, 69, 0, 0, 0 }, { 80, 73, 78, 0, 0, 0 },
            { 80, 73, 78, 71, 0, 0 }, { 80, 79, 0, 0, 0, 0 }, { 80, 79, 85, 0, 0, 0 },
            { 80, 85, 0, 0, 0, 0 }, { 81, 73, 0, 0, 0, 0 }, { 81, 73, 65, 0, 0, 0 },
            { 81, 73, 65, 78, 0, 0 }, { 81, 73, 65, 78, 71, 0 }, { 81, 73, 65, 79, 0, 0 },
            { 81, 73, 69, 0, 0, 0 }, { 81, 73, 78, 0, 0, 0 }, { 81, 73, 78, 71, 0, 0 },
            { 81, 73, 79, 78, 71, 0 }, { 81, 73, 85, 0, 0, 0 }, { 81, 85, 0, 0, 0, 0 },
            { 81, 85, 65, 78, 0, 0 }, { 81, 85, 69, 0, 0, 0 }, { 81, 85, 78, 0, 0, 0 },
            { 82, 65, 78, 0, 0, 0 }, { 82, 65, 78, 71, 0, 0 }, { 82, 65, 79, 0, 0, 0 },
            { 82, 69, 0, 0, 0, 0 }, { 82, 69, 78, 0, 0, 0 }, { 82, 69, 78, 71, 0, 0 },
            { 82, 73, 0, 0, 0, 0 }, { 82, 79, 78, 71, 0, 0 }, { 82, 79, 85, 0, 0, 0 },
            { 82, 85, 0, 0, 0, 0 }, { 82, 85, 65, 78, 0, 0 }, { 82, 85, 73, 0, 0, 0 },
            { 82, 85, 78, 0, 0, 0 }, { 82, 85, 79, 0, 0, 0 }, { 83, 65, 0, 0, 0, 0 },
            { 83, 65, 73, 0, 0, 0 }, { 83, 65, 78, 0, 0, 0 }, { 83, 65, 78, 71, 0, 0 },
            { 83, 65, 79, 0, 0, 0 }, { 83, 69, 0, 0, 0, 0 }, { 83, 69, 78, 0, 0, 0 },
            { 83, 69, 78, 71, 0, 0 }, { 83, 72, 65, 0, 0, 0 }, { 83, 72, 65, 73, 0, 0 },
            { 83, 72, 65, 78, 0, 0 }, { 83, 72, 65, 78, 71, 0 }, { 83, 72, 65, 79, 0, 0 },
            { 83, 72, 69, 0, 0, 0 }, { 83, 72, 69, 78, 0, 0 }, { 83, 72, 69, 78, 71, 0 },
            { 83, 72, 73, 0, 0, 0 }, { 83, 72, 79, 85, 0, 0 }, { 83, 72, 85, 0, 0, 0 },
            { 83, 72, 85, 65, 0, 0 }, { 83, 72, 85, 65, 73, 0 }, { 83, 72, 85, 65, 78, 0 },
            { 83, 72, 85, 65, 78, 71 }, { 83, 72, 85, 73, 0, 0 }, { 83, 72, 85, 78, 0, 0 },
            { 83, 72, 85, 79, 0, 0 }, { 83, 73, 0, 0, 0, 0 }, { 83, 79, 78, 71, 0, 0 },
            { 83, 79, 85, 0, 0, 0 }, { 83, 85, 0, 0, 0, 0 }, { 83, 85, 65, 78, 0, 0 },
            { 83, 85, 73, 0, 0, 0 }, { 83, 85, 78, 0, 0, 0 }, { 83, 85, 79, 0, 0, 0 },
            { 84, 65, 0, 0, 0, 0 }, { 84, 65, 73, 0, 0, 0 }, { 84, 65, 78, 0, 0, 0 },
            { 84, 65, 78, 71, 0, 0 }, { 84, 65, 79, 0, 0, 0 }, { 84, 69, 0, 0, 0, 0 },
            { 84, 69, 78, 71, 0, 0 }, { 84, 73, 0, 0, 0, 0 }, { 84, 73, 65, 78, 0, 0 },
            { 84, 73, 65, 79, 0, 0 }, { 84, 73, 69, 0, 0, 0 }, { 84, 73, 78, 71, 0, 0 },
            { 84, 79, 78, 71, 0, 0 }, { 84, 79, 85, 0, 0, 0 }, { 84, 85, 0, 0, 0, 0 },
            { 84, 85, 65, 78, 0, 0 }, { 84, 85, 73, 0, 0, 0 }, { 84, 85, 78, 0, 0, 0 },
            { 84, 85, 79, 0, 0, 0 }, { 87, 65, 0, 0, 0, 0 }, { 87, 65, 73, 0, 0, 0 },
            { 87, 65, 78, 0, 0, 0 }, { 87, 65, 78, 71, 0, 0 }, { 87, 69, 73, 0, 0, 0 },
            { 87, 69, 78, 0, 0, 0 }, { 87, 69, 78, 71, 0, 0 }, { 87, 79, 0, 0, 0, 0 },
            { 87, 85, 0, 0, 0, 0 }, { 88, 73, 0, 0, 0, 0 }, { 88, 73, 65, 0, 0, 0 },
            { 88, 73, 65, 78, 0, 0 }, { 88, 73, 65, 78, 71, 0 }, { 88, 73, 65, 79, 0, 0 },
            { 88, 73, 69, 0, 0, 0 }, { 88, 73, 78, 0, 0, 0 }, { 88, 73, 78, 71, 0, 0 },
            { 88, 73, 79, 78, 71, 0 }, { 88, 73, 85, 0, 0, 0 }, { 88, 85, 0, 0, 0, 0 },
            { 88, 85, 65, 78, 0, 0 }, { 88, 85, 69, 0, 0, 0 }, { 88, 85, 78, 0, 0, 0 },
            { 89, 65, 0, 0, 0, 0 }, { 89, 65, 78, 0, 0, 0 }, { 89, 65, 78, 71, 0, 0 },
            { 89, 65, 79, 0, 0, 0 }, { 89, 69, 0, 0, 0, 0 }, { 89, 73, 0, 0, 0, 0 },
            { 89, 73, 78, 0, 0, 0 }, { 89, 73, 78, 71, 0, 0 }, { 89, 79, 0, 0, 0, 0 },
            { 89, 79, 78, 71, 0, 0 }, { 89, 79, 85, 0, 0, 0 }, { 89, 85, 0, 0, 0, 0 },
            { 89, 85, 65, 78, 0, 0 }, { 89, 85, 69, 0, 0, 0 }, { 89, 85, 78, 0, 0, 0 },
            { 90, 65, 0, 0, 0, 0 }, { 90, 65, 73, 0, 0, 0 }, { 90, 65, 78, 0, 0, 0 },
            { 90, 65, 78, 71, 0, 0 }, { 90, 65, 79, 0, 0, 0 }, { 90, 69, 0, 0, 0, 0 },
            { 90, 69, 73, 0, 0, 0 }, { 90, 69, 78, 0, 0, 0 }, { 90, 69, 78, 71, 0, 0 },
            { 90, 72, 65, 0, 0, 0 }, { 90, 72, 65, 73, 0, 0 }, { 90, 72, 65, 78, 0, 0 },
            { 90, 72, 65, 78, 71, 0 }, { 90, 72, 65, 79, 0, 0 }, { 90, 72, 69, 0, 0, 0 },
            { 90, 72, 69, 78, 0, 0 }, { 90, 72, 69, 78, 71, 0 }, { 90, 72, 73, 0, 0, 0 },
            { 90, 72, 79, 78, 71, 0 }, { 90, 72, 79, 85, 0, 0 }, { 90, 72, 85, 0, 0, 0 },
            { 90, 72, 85, 65, 0, 0 }, { 90, 72, 85, 65, 73, 0 }, { 90, 72, 85, 65, 78, 0 },
            { 90, 72, 85, 65, 78, 71 }, { 90, 72, 85, 73, 0, 0 }, { 90, 72, 85, 78, 0, 0 },
            { 90, 72, 85, 79, 0, 0 }, { 90, 73, 0, 0, 0, 0 }, { 90, 79, 78, 71, 0, 0 },
            { 90, 79, 85, 0, 0, 0 }, { 90, 85, 0, 0, 0, 0 }, { 90, 85, 65, 78, 0, 0 },
            { 90, 85, 73, 0, 0, 0 }, { 90, 85, 78, 0, 0, 0 }, { 90, 85, 79, 0, 0, 0 }, };

    /** First and last Chinese character with known Pinyin according to zh collation */
    private static final String FIRST_PINYIN_UNIHAN = "\u963F";
    private static final String LAST_PINYIN_UNIHAN = "\u84D9";

    /** The first Chinese character in Unicode block */
    private static final char FIRST_UNIHAN = '\u3400';

    private static final Collator COLLATOR = Collator.getInstance(Locale.CHINA);

    private static HanziToPinyin sInstance;

    private final boolean mHasChinaCollator;

    /** One unit of converted output: a run of non-Hanzi characters or a single Hanzi. */
    public static class Token {
        /**
         * Separator between target string for each source char
         */
        public static final String SEPARATOR = " ";

        public static final int LATIN = 1;
        public static final int PINYIN = 2;
        public static final int UNKNOWN = 3;

        public Token() {
        }

        public Token(int type, String source, String target) {
            this.type = type;
            this.source = source;
            this.target = target;
        }

        /**
         * Type of this token, LATIN, PINYIN or UNKNOWN.
         */
        public int type;

        /**
         * Original string before translation.
         */
        public String source;

        /**
         * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
         * original string in source.
         */
        public String target;
    }

    protected HanziToPinyin(boolean hasChinaCollator) {
        mHasChinaCollator = hasChinaCollator;
    }

    /**
     * Returns the shared converter, creating it on first use.
     *
     * <p>The instance is enabled only when {@link Collator#getAvailableLocales()} contains
     * {@link Locale#CHINA}; otherwise a disabled instance is created, for which
     * {@link #get(String)} always returns an empty list.
     */
    public static HanziToPinyin getInstance() {
        synchronized (HanziToPinyin.class) {
            if (sInstance != null) {
                return sInstance;
            }
            // Check if zh_CN collation data is available
            for (Locale locale : Collator.getAvailableLocales()) {
                if (locale.equals(Locale.CHINA)) {
                    // Do self validation just once.
                    if (DEBUG) {
                        Log.d(TAG, "Self validation. Result: " + doSelfValidation());
                    }
                    sInstance = new HanziToPinyin(true);
                    return sInstance;
                }
            }
            Log.w(TAG, "There is no Chinese collator, HanziToPinyin is disabled");
            sInstance = new HanziToPinyin(false);
            return sInstance;
        }
    }

    /**
     * Validate if our internal table has some wrong value: every entry of {@link #UNIHANS} must
     * compare strictly greater than its predecessor under the zh_CN collator.
     *
     * @return true when the table looks correct.
     */
    private static boolean doSelfValidation() {
        String lastString = Character.toString(UNIHANS[0]);
        for (int i = 1; i < UNIHANS.length; i++) {
            final String curString = Character.toString(UNIHANS[i]);
            if (COLLATOR.compare(lastString, curString) >= 0) {
                Log.e(TAG, "Internal error in Unihan table. " + "The last string \"" + lastString
                        + "\" is greater than current string \"" + curString + "\".");
                return false;
            }
            lastString = curString;
        }
        return true;
    }

    /**
     * Classifies a single character and, for Hanzi inside the known pinyin range, looks up its
     * pinyin.
     *
     * @param character the character to convert
     * @return a LATIN token for code points below 256; an UNKNOWN token for characters below
     *         {@link #FIRST_UNIHAN}, outside the collation range
     *         [{@link #FIRST_PINYIN_UNIHAN}, {@link #LAST_PINYIN_UNIHAN}], or that the collator
     *         orders before the first table entry; otherwise a PINYIN token whose target is the
     *         upper-case pinyin. For non-PINYIN tokens the target equals the source.
     */
    private Token getToken(char character) {
        final String letter = Character.toString(character);
        if (character < 256) {
            return new Token(Token.LATIN, letter, letter);
        }
        if (character < FIRST_UNIHAN) {
            return new Token(Token.UNKNOWN, letter, letter);
        }

        int offset = -1;
        int cmp = COLLATOR.compare(letter, FIRST_PINYIN_UNIHAN);
        if (cmp < 0) {
            return new Token(Token.UNKNOWN, letter, letter);
        }
        if (cmp == 0) {
            offset = 0;
        } else {
            cmp = COLLATOR.compare(letter, LAST_PINYIN_UNIHAN);
            if (cmp > 0) {
                return new Token(Token.UNKNOWN, letter, letter);
            }
            if (cmp == 0) {
                offset = UNIHANS.length - 1;
            }
        }

        if (offset < 0) {
            // Binary search for the table entry that collates equal to, or just around, the letter.
            int begin = 0;
            int end = UNIHANS.length - 1;
            while (begin <= end) {
                offset = (begin + end) >>> 1;
                cmp = COLLATOR.compare(letter, Character.toString(UNIHANS[offset]));
                if (cmp == 0) {
                    break;
                } else if (cmp > 0) {
                    begin = offset + 1;
                } else {
                    end = offset - 1;
                }
            }
        }
        // The last probe was greater than the letter, so the letter belongs to the previous entry.
        if (cmp < 0) {
            offset--;
        }
        if (offset < 0) {
            // The collator ordered this character before UNIHANS[0] although it is not below
            // FIRST_PINYIN_UNIHAN (collation data differs from the table). Report it as UNKNOWN
            // rather than indexing PINYINS[-1].
            return new Token(Token.UNKNOWN, letter, letter);
        }

        final byte[] encoded = PINYINS[offset];
        final StringBuilder pinyin = new StringBuilder(encoded.length);
        for (int j = 0; j < encoded.length && encoded[j] != 0; j++) {
            pinyin.append((char) encoded[j]);
        }
        return new Token(Token.PINYIN, letter, pinyin.toString());
    }

    /**
     * Convert the input to a array of tokens. The sequence of LATIN or UNKNOWN characters without
     * space will be put into a Token, One Hanzi character which has pinyin will be treated as a
     * Token. If there is no China collator, the empty token array is returned.
     *
     * @param input the text to convert; may be null or empty
     * @return the tokens in input order; empty (never null) when the converter is disabled or the
     *         input is empty. Space characters only separate tokens and are not emitted.
     */
    public ArrayList<Token> get(final String input) {
        ArrayList<Token> tokens = new ArrayList<Token>();
        if (!mHasChinaCollator || TextUtils.isEmpty(input)) {
            // return empty tokens.
            return tokens;
        }
        final int inputLength = input.length();
        final StringBuilder sb = new StringBuilder();
        int tokenType = Token.LATIN;
        // Go through the input, create a new token when
        // a. Token type changed
        // b. Get the Pinyin of current character.
        // c. current character is space.
        for (int i = 0; i < inputLength; i++) {
            final char character = input.charAt(i);
            if (character == ' ') {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
            } else if (character < 256) {
                if (tokenType != Token.LATIN && sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                tokenType = Token.LATIN;
                sb.append(character);
            } else if (character < FIRST_UNIHAN) {
                if (tokenType != Token.UNKNOWN && sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                tokenType = Token.UNKNOWN;
                sb.append(character);
            } else {
                Token t = getToken(character);
                if (t.type == Token.PINYIN) {
                    if (sb.length() > 0) {
                        addToken(sb, tokens, tokenType);
                    }
                    tokens.add(t);
                    tokenType = Token.PINYIN;
                } else {
                    if (tokenType != t.type && sb.length() > 0) {
                        addToken(sb, tokens, tokenType);
                    }
                    tokenType = t.type;
                    sb.append(character);
                }
            }
        }
        if (sb.length() > 0) {
            addToken(sb, tokens, tokenType);
        }
        return tokens;
    }

    /**
     * Appends the buffered characters as one token (source and target identical) and clears the
     * buffer.
     */
    private void addToken(
            final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
        String str = sb.toString();
        tokens.add(new Token(tokenType, str, str));
        sb.setLength(0);
    }

    // The following lines are provided and maintained by Mediatek inc.

    /** Letter-case states used while splitting Latin text for dialer search. */
    private static class DialerSearchToken extends Token {
        static final int FIRSTCASE = 0;
        static final int UPPERCASE = 1;
        static final int LOWERCASE = 2;
    }

    /**
     * Builds the dialer-search key for a name, together with the source position of every key
     * character.
     *
     * <p>The input is split into words at spaces. Inside a word a new token starts when a Hanzi
     * with pinyin is met (each such Hanzi is its own token, spelled as pinyin) or when Latin text
     * goes from a non-upper-case character to an upper-case one. Latin characters are stored
     * upper-cased. {@code '-'}, {@code ','}, characters in the range [256, FIRST_UNIHAN) and Hanzi
     * without pinyin contribute nothing to the key but still advance the position counter.
     * Scanning stops once the position counter exceeds 127.
     *
     * <p>For each word, every suffix of its token sequence (longest first) is appended to the
     * result, each preceded by one char whose code is the suffix length. {@code offsets} receives
     * the same layout, with every key character replaced by a char whose code is the position of
     * the source character it came from.
     *
     * @param input   the name to convert
     * @param offsets receives the offset encoding; appended to, not cleared
     * @return the encoded key, or {@code null} when {@code offsets} is null or {@code input} is
     *         null or empty
     */
    public String getTokensForDialerSearch(final String input, StringBuilder offsets) {
        if (offsets == null || TextUtils.isEmpty(input)) {
            // Callers receive null (not an empty string) for unusable arguments.
            return null;
        }

        StringBuilder subStrSet = new StringBuilder();
        ArrayList<Token> tokens = new ArrayList<Token>();
        ArrayList<String> shortSubStrOffset = new ArrayList<String>();
        final int inputLength = input.length();
        final StringBuilder subString = new StringBuilder();
        final StringBuilder subStrOffset = new StringBuilder();
        int tokenType = Token.LATIN;
        int caseTypePre = DialerSearchToken.FIRSTCASE;
        int caseTypeCurr;
        int pos = 0;

        // Go through the input, create a new token when
        // a. Token type changed
        // b. Get the Pinyin of current character.
        // c. current character is space.
        // d. Token case changed from lower case to upper case.
        for (int i = 0; i < inputLength; i++) {
            final char character = input.charAt(i);
            if (character == '-' || character == ',') {
                // Skipped: not part of the key and does not end the current token.
                pos++;
            } else if (character == ' ') {
                if (subString.length() > 0) {
                    addToken(subString, tokens, tokenType);
                    addOffsets(subStrOffset, shortSubStrOffset);
                }
                // A space ends the word: emit all suffixes of the tokens collected so far.
                addSubString(tokens, shortSubStrOffset, subStrSet, offsets);
                pos++;
                caseTypePre = DialerSearchToken.FIRSTCASE;
            } else if (character < 256) {
                if (tokenType != Token.LATIN && subString.length() > 0) {
                    addToken(subString, tokens, tokenType);
                    addOffsets(subStrOffset, shortSubStrOffset);
                }
                // Anything that is not 'A'..'Z' (digits, symbols, ...) counts as lower case.
                caseTypeCurr = (character >= 'A' && character <= 'Z')
                        ? DialerSearchToken.UPPERCASE : DialerSearchToken.LOWERCASE;
                if (caseTypePre == DialerSearchToken.LOWERCASE
                        && caseTypeCurr == DialerSearchToken.UPPERCASE) {
                    addToken(subString, tokens, tokenType);
                    addOffsets(subStrOffset, shortSubStrOffset);
                }
                caseTypePre = caseTypeCurr;
                tokenType = Token.LATIN;
                subString.append(Character.toUpperCase(character));
                subStrOffset.append((char) pos);
                pos++;
            } else if (character < FIRST_UNIHAN) {
                // Do not cover unknown characters since they can not be input.
                pos++;
            } else {
                Token t = getToken(character);
                if (t.type == Token.PINYIN) {
                    if (subString.length() > 0) {
                        addToken(subString, tokens, tokenType);
                        addOffsets(subStrOffset, shortSubStrOffset);
                    }
                    tokens.add(t);
                    // Every pinyin letter maps back to the same source position.
                    final int tokenSize = t.target.length();
                    for (int j = 0; j < tokenSize; j++) {
                        subStrOffset.append((char) pos);
                    }
                    addOffsets(subStrOffset, shortSubStrOffset);
                    tokenType = Token.PINYIN;
                    caseTypePre = DialerSearchToken.FIRSTCASE;
                    pos++;
                } else {
                    // Do not cover special characters since they can not be input.
                    pos++;
                }
            }
            // If the name string is too long, cut it off to meet the storage request of dialer
            // search.
            if (pos > 127) {
                break;
            }
        }
        if (subString.length() > 0) {
            addToken(subString, tokens, tokenType);
            addOffsets(subStrOffset, shortSubStrOffset);
        }
        addSubString(tokens, shortSubStrOffset, subStrSet, offsets);
        return subStrSet.toString();
    }

    /** Stores the buffered offsets as the entry for the token just added and clears the buffer. */
    private void addOffsets(final StringBuilder sb, final ArrayList<String> shortSubStrOffset) {
        String str = sb.toString();
        shortSubStrOffset.add(str);
        sb.setLength(0);
    }

    /**
     * Appends every suffix of the current word's token sequence to {@code subStrSet}, and the
     * matching offsets to {@code offsets}, then clears both input lists.
     *
     * <p>Suffixes are emitted longest first; each is preceded by a single char whose code is the
     * suffix length in characters. Does nothing when {@code tokens} is null or empty.
     */
    private void addSubString(final ArrayList<Token> tokens,
            final ArrayList<String> shortSubStrOffset,
            StringBuilder subStrSet, StringBuilder offsets) {
        if (tokens == null || tokens.isEmpty()) {
            return;
        }

        int len = 0;
        StringBuilder suffix = new StringBuilder();
        StringBuilder suffixOffsets = new StringBuilder();
        StringBuilder suffixSet = new StringBuilder();
        StringBuilder suffixOffsetsSet = new StringBuilder();

        // Walk backwards so each step extends the previous suffix by one more leading token.
        for (int i = tokens.size() - 1; i >= 0; i--) {
            String target = tokens.get(i).target;
            len += target.length();
            String targetOffsets = shortSubStrOffset.get(i);
            if (suffix.length() > 0) {
                // Drop the length prefix of the shorter suffix before extending it.
                suffix.deleteCharAt(0);
                suffixOffsets.deleteCharAt(0);
            }
            suffix.insert(0, target);
            suffix.insert(0, (char) len);
            suffixOffsets.insert(0, targetOffsets);
            suffixOffsets.insert(0, (char) len);
            suffixSet.insert(0, suffix);
            suffixOffsetsSet.insert(0, suffixOffsets);
        }

        subStrSet.append(suffixSet);
        offsets.append(suffixOffsetsSet);
        tokens.clear();
        shortSubStrOffset.clear();
    }
    // The previous lines are provided and maintained by Mediatek inc.
}
调用示例（参数 input 是待转换的汉字字符串）：
/**
 * Converts the given text to upper-case pinyin.
 *
 * <p>Hanzi tokens are replaced by their pinyin; every other token is copied from its source
 * text. Tokens are concatenated without a separator, and spaces in the input are not
 * preserved because {@code HanziToPinyin.get} uses them only as token boundaries.
 *
 * @param input the text to convert (typically Chinese characters)
 * @return the converted, upper-cased text; empty when no tokens are produced
 */
public static String getPinYin2(String input) {
    ArrayList<Token> tokens = HanziToPinyin.getInstance().get(input);
    StringBuilder sb = new StringBuilder();
    if (tokens != null) {
        for (Token token : tokens) {
            sb.append(Token.PINYIN == token.type ? token.target : token.source);
        }
    }
    // Upper-case with a fixed locale: the default-locale overload would turn 'i' into a
    // dotted capital I on Turkish/Azeri devices and corrupt the ASCII pinyin key.
    return sb.toString().toUpperCase(java.util.Locale.ROOT);
}
浙公网安备 33010602011771号