1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.providers.contacts;
18 
19 import android.text.TextUtils;
20 import android.util.Log;
21 
22 import java.util.ArrayList;
23 import java.util.Locale;
24 
25 import libcore.icu.Transliterator;
26 
27 /**
28  * An object to convert Chinese character to its corresponding pinyin string.
29  * For characters with multiple possible pinyin string, only one is selected
30  * according to ICU Transliterator class. Polyphone is not supported in this
31  * implementation.
32  */
33 public class HanziToPinyin {
34     private static final String TAG = "HanziToPinyin";
35 
36     private static HanziToPinyin sInstance;
37     private Transliterator mPinyinTransliterator;
38     private Transliterator mAsciiTransliterator;
39 
40     public static class Token {
41         /**
42          * Separator between target string for each source char
43          */
44         public static final String SEPARATOR = " ";
45 
46         public static final int LATIN = 1;
47         public static final int PINYIN = 2;
48         public static final int UNKNOWN = 3;
49 
Token()50         public Token() {
51         }
52 
Token(int type, String source, String target)53         public Token(int type, String source, String target) {
54             this.type = type;
55             this.source = source;
56             this.target = target;
57         }
58 
59         /**
60          * Type of this token, ASCII, PINYIN or UNKNOWN.
61          */
62         public int type;
63         /**
64          * Original string before translation.
65          */
66         public String source;
67         /**
68          * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
69          * original string in source.
70          */
71         public String target;
72     }
73 
HanziToPinyin()74     private HanziToPinyin() {
75         try {
76             mPinyinTransliterator = new Transliterator("Han-Latin/Names; Latin-Ascii; Any-Upper");
77             mAsciiTransliterator = new Transliterator("Latin-Ascii");
78         } catch (RuntimeException e) {
79             Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
80                   + " HanziToPinyin is disabled");
81         }
82     }
83 
hasChineseTransliterator()84     public boolean hasChineseTransliterator() {
85         return mPinyinTransliterator != null;
86     }
87 
getInstance()88     public static HanziToPinyin getInstance() {
89         synchronized (HanziToPinyin.class) {
90             if (sInstance == null) {
91                 sInstance = new HanziToPinyin();
92             }
93             return sInstance;
94         }
95     }
96 
tokenize(char character, Token token)97     private void tokenize(char character, Token token) {
98         token.source = Character.toString(character);
99 
100         // ASCII
101         if (character < 128) {
102             token.type = Token.LATIN;
103             token.target = token.source;
104             return;
105         }
106 
107         // Extended Latin. Transcode these to ASCII equivalents
108         if (character < 0x250 || (0x1e00 <= character && character < 0x1eff)) {
109             token.type = Token.LATIN;
110             token.target = mAsciiTransliterator == null ? token.source :
111                 mAsciiTransliterator.transliterate(token.source);
112             return;
113         }
114 
115         token.type = Token.PINYIN;
116         token.target = mPinyinTransliterator.transliterate(token.source);
117         if (TextUtils.isEmpty(token.target) ||
118             TextUtils.equals(token.source, token.target)) {
119             token.type = Token.UNKNOWN;
120             token.target = token.source;
121         }
122     }
123 
transliterate(final String input)124     public String transliterate(final String input) {
125         if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
126             return null;
127         }
128         return mPinyinTransliterator.transliterate(input);
129     }
130 
131     /**
132      * Convert the input to a array of tokens. The sequence of ASCII or Unknown characters without
133      * space will be put into a Token, One Hanzi character which has pinyin will be treated as a
134      * Token. If there is no Chinese transliterator, the empty token array is returned.
135      */
getTokens(final String input)136     public ArrayList<Token> getTokens(final String input) {
137         ArrayList<Token> tokens = new ArrayList<Token>();
138         if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
139             // return empty tokens.
140             return tokens;
141         }
142 
143         final int inputLength = input.length();
144         final StringBuilder sb = new StringBuilder();
145         int tokenType = Token.LATIN;
146         Token token = new Token();
147 
148         // Go through the input, create a new token when
149         // a. Token type changed
150         // b. Get the Pinyin of current charater.
151         // c. current character is space.
152         for (int i = 0; i < inputLength; i++) {
153             final char character = input.charAt(i);
154             if (Character.isSpaceChar(character)) {
155                 if (sb.length() > 0) {
156                     addToken(sb, tokens, tokenType);
157                 }
158             } else {
159                 tokenize(character, token);
160                 if (token.type == Token.PINYIN) {
161                     if (sb.length() > 0) {
162                         addToken(sb, tokens, tokenType);
163                     }
164                     tokens.add(token);
165                     token = new Token();
166                 } else {
167                     if (tokenType != token.type && sb.length() > 0) {
168                         addToken(sb, tokens, tokenType);
169                     }
170                     sb.append(token.target);
171                 }
172                 tokenType = token.type;
173             }
174         }
175         if (sb.length() > 0) {
176             addToken(sb, tokens, tokenType);
177         }
178         return tokens;
179     }
180 
addToken( final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType)181     private void addToken(
182             final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
183         String str = sb.toString();
184         tokens.add(new Token(tokenType, str, str));
185         sb.setLength(0);
186     }
187 }
188