1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.providers.contacts; 18 19 import android.text.TextUtils; 20 import android.util.Log; 21 22 import java.util.ArrayList; 23 import java.util.Locale; 24 25 import libcore.icu.Transliterator; 26 27 /** 28 * An object to convert Chinese character to its corresponding pinyin string. 29 * For characters with multiple possible pinyin string, only one is selected 30 * according to ICU Transliterator class. Polyphone is not supported in this 31 * implementation. 32 */ 33 public class HanziToPinyin { 34 private static final String TAG = "HanziToPinyin"; 35 36 private static HanziToPinyin sInstance; 37 private Transliterator mPinyinTransliterator; 38 private Transliterator mAsciiTransliterator; 39 40 public static class Token { 41 /** 42 * Separator between target string for each source char 43 */ 44 public static final String SEPARATOR = " "; 45 46 public static final int LATIN = 1; 47 public static final int PINYIN = 2; 48 public static final int UNKNOWN = 3; 49 Token()50 public Token() { 51 } 52 Token(int type, String source, String target)53 public Token(int type, String source, String target) { 54 this.type = type; 55 this.source = source; 56 this.target = target; 57 } 58 59 /** 60 * Type of this token, ASCII, PINYIN or UNKNOWN. 61 */ 62 public int type; 63 /** 64 * Original string before translation. 65 */ 66 public String source; 67 /** 68 * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is 69 * original string in source. 70 */ 71 public String target; 72 } 73 HanziToPinyin()74 private HanziToPinyin() { 75 try { 76 mPinyinTransliterator = new Transliterator("Han-Latin/Names; Latin-Ascii; Any-Upper"); 77 mAsciiTransliterator = new Transliterator("Latin-Ascii"); 78 } catch (RuntimeException e) { 79 Log.w(TAG, "Han-Latin/Names transliterator data is missing," 80 + " HanziToPinyin is disabled"); 81 } 82 } 83 hasChineseTransliterator()84 public boolean hasChineseTransliterator() { 85 return mPinyinTransliterator != null; 86 } 87 getInstance()88 public static HanziToPinyin getInstance() { 89 synchronized (HanziToPinyin.class) { 90 if (sInstance == null) { 91 sInstance = new HanziToPinyin(); 92 } 93 return sInstance; 94 } 95 } 96 tokenize(char character, Token token)97 private void tokenize(char character, Token token) { 98 token.source = Character.toString(character); 99 100 // ASCII 101 if (character < 128) { 102 token.type = Token.LATIN; 103 token.target = token.source; 104 return; 105 } 106 107 // Extended Latin. Transcode these to ASCII equivalents 108 if (character < 0x250 || (0x1e00 <= character && character < 0x1eff)) { 109 token.type = Token.LATIN; 110 token.target = mAsciiTransliterator == null ? token.source : 111 mAsciiTransliterator.transliterate(token.source); 112 return; 113 } 114 115 token.type = Token.PINYIN; 116 token.target = mPinyinTransliterator.transliterate(token.source); 117 if (TextUtils.isEmpty(token.target) || 118 TextUtils.equals(token.source, token.target)) { 119 token.type = Token.UNKNOWN; 120 token.target = token.source; 121 } 122 } 123 transliterate(final String input)124 public String transliterate(final String input) { 125 if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) { 126 return null; 127 } 128 return mPinyinTransliterator.transliterate(input); 129 } 130 131 /** 132 * Convert the input to a array of tokens. The sequence of ASCII or Unknown characters without 133 * space will be put into a Token, One Hanzi character which has pinyin will be treated as a 134 * Token. If there is no Chinese transliterator, the empty token array is returned. 135 */ getTokens(final String input)136 public ArrayList<Token> getTokens(final String input) { 137 ArrayList<Token> tokens = new ArrayList<Token>(); 138 if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) { 139 // return empty tokens. 140 return tokens; 141 } 142 143 final int inputLength = input.length(); 144 final StringBuilder sb = new StringBuilder(); 145 int tokenType = Token.LATIN; 146 Token token = new Token(); 147 148 // Go through the input, create a new token when 149 // a. Token type changed 150 // b. Get the Pinyin of current charater. 151 // c. current character is space. 152 for (int i = 0; i < inputLength; i++) { 153 final char character = input.charAt(i); 154 if (Character.isSpaceChar(character)) { 155 if (sb.length() > 0) { 156 addToken(sb, tokens, tokenType); 157 } 158 } else { 159 tokenize(character, token); 160 if (token.type == Token.PINYIN) { 161 if (sb.length() > 0) { 162 addToken(sb, tokens, tokenType); 163 } 164 tokens.add(token); 165 token = new Token(); 166 } else { 167 if (tokenType != token.type && sb.length() > 0) { 168 addToken(sb, tokens, tokenType); 169 } 170 sb.append(token.target); 171 } 172 tokenType = token.type; 173 } 174 } 175 if (sb.length() > 0) { 176 addToken(sb, tokens, tokenType); 177 } 178 return tokens; 179 } 180 addToken( final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType)181 private void addToken( 182 final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) { 183 String str = sb.toString(); 184 tokens.add(new Token(tokenType, str, str)); 185 sb.setLength(0); 186 } 187 } 188