1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 package com.android.providers.contacts;
17 
18 import android.content.ContentValues;
19 import android.provider.ContactsContract.CommonDataKinds.StructuredName;
20 import android.provider.ContactsContract.FullNameStyle;
21 import android.provider.ContactsContract.PhoneticNameStyle;
22 import android.text.TextUtils;
23 
24 import com.android.providers.contacts.util.NeededForTesting;
25 
26 import java.lang.Character.UnicodeBlock;
27 import java.util.HashSet;
28 import java.util.Locale;
29 import java.util.StringTokenizer;
30 
31 /**
32  * The purpose of this class is to split a full name into given names and last
33  * name. The logic only supports having a single last name. If the full name has
34  * multiple last names the output will be incorrect.
35  * <p>
36  * Core algorithm:
37  * <ol>
38  * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
39  * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
40  * <li>Assign the last remaining token as the last name.</li>
41  * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
42  * this word also as the last name.</li>
43  * <li>Assign the rest of the words as the "given names".</li>
44  * </ol>
45  */
46 public class NameSplitter {
47 
    /** Maximum number of tokens that {@link NameTokenizer} keeps from a single full name. */
    public static final int MAX_TOKENS = 10;

    // Lower-cased language codes that getAdjustedFullNameStyle() compares with mLanguage.
    private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase();
    private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase();

    // This includes simplified and traditional Chinese
    private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase();

    // The four sets below are built by convertToSet(), so every entry is trimmed and
    // upper-cased.
    private final HashSet<String> mPrefixesSet;
    private final HashSet<String> mSuffixesSet;
    // Length of the longest entry in mSuffixesSet; 0 when the set is empty.
    private final int mMaxSuffixLength;
    private final HashSet<String> mLastNamePrefixesSet;
    private final HashSet<String> mConjuctions;
    // Locale given to the constructor, or the default locale when null was given.
    private final Locale mLocale;
    // Lower-cased language code of mLocale.
    private final String mLanguage;

    /**
     * Two-character long Korean family names, used by splitKoreanName() when the
     * full name contains no separator.
     * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1
     */
    private static final String[] KOREAN_TWO_CHARCTER_FAMILY_NAMES = {
        "\uAC15\uC804", // Gang Jeon
        "\uB0A8\uAD81", // Nam Goong
        "\uB3C5\uACE0", // Dok Go
        "\uB3D9\uBC29", // Dong Bang
        "\uB9DD\uC808", // Mang Jeol
        "\uC0AC\uACF5", // Sa Gong
        "\uC11C\uBB38", // Seo Moon
        "\uC120\uC6B0", // Seon Woo
        "\uC18C\uBD09", // So Bong
        "\uC5B4\uAE08", // Uh Geum
        "\uC7A5\uACE1", // Jang Gok
        "\uC81C\uAC08", // Je Gal
        "\uD669\uBCF4"  // Hwang Bo
    };
83 
84     public static class Name {
85         public String prefix;
86         public String givenNames;
87         public String middleName;
88         public String familyName;
89         public String suffix;
90 
91         public int fullNameStyle;
92 
93         public String phoneticFamilyName;
94         public String phoneticMiddleName;
95         public String phoneticGivenName;
96 
97         public int phoneticNameStyle;
98 
Name()99         public Name() {
100         }
101 
Name(String prefix, String givenNames, String middleName, String familyName, String suffix)102         public Name(String prefix, String givenNames, String middleName, String familyName,
103                 String suffix) {
104             this.prefix = prefix;
105             this.givenNames = givenNames;
106             this.middleName = middleName;
107             this.familyName = familyName;
108             this.suffix = suffix;
109         }
110 
111         @NeededForTesting
getPrefix()112         public String getPrefix() {
113             return prefix;
114         }
115 
getGivenNames()116         public String getGivenNames() {
117             return givenNames;
118         }
119 
getMiddleName()120         public String getMiddleName() {
121             return middleName;
122         }
123 
getFamilyName()124         public String getFamilyName() {
125             return familyName;
126         }
127 
128         @NeededForTesting
getSuffix()129         public String getSuffix() {
130             return suffix;
131         }
132 
getFullNameStyle()133         public int getFullNameStyle() {
134             return fullNameStyle;
135         }
136 
getPhoneticFamilyName()137         public String getPhoneticFamilyName() {
138             return phoneticFamilyName;
139         }
140 
getPhoneticMiddleName()141         public String getPhoneticMiddleName() {
142             return phoneticMiddleName;
143         }
144 
getPhoneticGivenName()145         public String getPhoneticGivenName() {
146             return phoneticGivenName;
147         }
148 
getPhoneticNameStyle()149         public int getPhoneticNameStyle() {
150             return phoneticNameStyle;
151         }
152 
fromValues(ContentValues values)153         public void fromValues(ContentValues values) {
154             prefix = values.getAsString(StructuredName.PREFIX);
155             givenNames = values.getAsString(StructuredName.GIVEN_NAME);
156             middleName = values.getAsString(StructuredName.MIDDLE_NAME);
157             familyName = values.getAsString(StructuredName.FAMILY_NAME);
158             suffix = values.getAsString(StructuredName.SUFFIX);
159 
160             Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE);
161             fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer;
162 
163             phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME);
164             phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME);
165             phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME);
166 
167             integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE);
168             phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer;
169         }
170 
toValues(ContentValues values)171         public void toValues(ContentValues values) {
172             putValueIfPresent(values, StructuredName.PREFIX, prefix);
173             putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames);
174             putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName);
175             putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName);
176             putValueIfPresent(values, StructuredName.SUFFIX, suffix);
177             values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle);
178             putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName);
179             putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName);
180             putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName);
181             values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle);
182         }
183 
putValueIfPresent(ContentValues values, String name, String value)184         private void putValueIfPresent(ContentValues values, String name, String value) {
185             if (value != null) {
186                 values.put(name, value);
187             }
188         }
189 
clear()190         public void clear() {
191             prefix = null;
192             givenNames = null;
193             middleName = null;
194             familyName = null;
195             suffix = null;
196             fullNameStyle = FullNameStyle.UNDEFINED;
197             phoneticFamilyName = null;
198             phoneticMiddleName = null;
199             phoneticGivenName = null;
200             phoneticNameStyle = PhoneticNameStyle.UNDEFINED;
201         }
202 
isEmpty()203         public boolean isEmpty() {
204             return TextUtils.isEmpty(givenNames)
205                     && TextUtils.isEmpty(middleName)
206                     && TextUtils.isEmpty(familyName)
207                     && TextUtils.isEmpty(suffix)
208                     && TextUtils.isEmpty(phoneticFamilyName)
209                     && TextUtils.isEmpty(phoneticMiddleName)
210                     && TextUtils.isEmpty(phoneticGivenName);
211         }
212 
213         @Override
toString()214         public String toString() {
215             return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName
216                     + " family: " + familyName + " suffix: " + suffix + " ph/given: "
217                     + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: "
218                     + phoneticFamilyName + "]";
219         }
220     }
221 
222     private static class NameTokenizer extends StringTokenizer {
223         private final String[] mTokens;
224         private int mDotBitmask;
225         private int mCommaBitmask;
226         private int mStartPointer;
227         private int mEndPointer;
228 
NameTokenizer(String fullName)229         public NameTokenizer(String fullName) {
230             super(fullName, " .,", true);
231 
232             mTokens = new String[MAX_TOKENS];
233 
234             // Iterate over tokens, skipping over empty ones and marking tokens that
235             // are followed by dots.
236             while (hasMoreTokens() && mEndPointer < MAX_TOKENS) {
237                 final String token = nextToken();
238                 if (token.length() > 0) {
239                     final char c = token.charAt(0);
240                     if (c == ' ') {
241                         continue;
242                     }
243                 }
244 
245                 if (mEndPointer > 0 && token.charAt(0) == '.') {
246                     mDotBitmask |= (1 << (mEndPointer - 1));
247                 } else if (mEndPointer > 0 && token.charAt(0) == ',') {
248                     mCommaBitmask |= (1 << (mEndPointer - 1));
249                 } else {
250                     mTokens[mEndPointer] = token;
251                     mEndPointer++;
252                 }
253             }
254         }
255 
256         /**
257          * Returns true if the token is followed by a dot in the original full name.
258          */
hasDot(int index)259         public boolean hasDot(int index) {
260             return (mDotBitmask & (1 << index)) != 0;
261         }
262 
263         /**
264          * Returns true if the token is followed by a comma in the original full name.
265          */
hasComma(int index)266         public boolean hasComma(int index) {
267             return (mCommaBitmask & (1 << index)) != 0;
268         }
269     }
270 
271     /**
272      * Constructor.
273      *
274      * @param commonPrefixes comma-separated list of common prefixes,
275      *            e.g. "Mr, Ms, Mrs"
276      * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
277      *            e.g. "d', st, st., von"
278      * @param commonSuffixes comma-separated list of common suffixes,
279      *            e.g. "Jr, M.D., MD, D.D.S."
280      * @param commonConjunctions comma-separated list of common conjuctions,
281      *            e.g. "AND, Or"
282      */
NameSplitter(String commonPrefixes, String commonLastNamePrefixes, String commonSuffixes, String commonConjunctions, Locale locale)283     public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
284             String commonSuffixes, String commonConjunctions, Locale locale) {
285         // TODO: refactor this to use <string-array> resources
286         mPrefixesSet = convertToSet(commonPrefixes);
287         mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
288         mSuffixesSet = convertToSet(commonSuffixes);
289         mConjuctions = convertToSet(commonConjunctions);
290         mLocale = locale != null ? locale : Locale.getDefault();
291         mLanguage = mLocale.getLanguage().toLowerCase();
292 
293         int maxLength = 0;
294         for (String suffix : mSuffixesSet) {
295             if (suffix.length() > maxLength) {
296                 maxLength = suffix.length();
297             }
298         }
299 
300         mMaxSuffixLength = maxLength;
301     }
302 
303     /**
304      * Converts a comma-separated list of Strings to a set of Strings. Trims strings
305      * and converts them to upper case.
306      */
convertToSet(String strings)307     private static HashSet<String> convertToSet(String strings) {
308         HashSet<String> set = new HashSet<String>();
309         if (strings != null) {
310             String[] split = strings.split(",");
311             for (int i = 0; i < split.length; i++) {
312                 set.add(split[i].trim().toUpperCase());
313             }
314         }
315         return set;
316     }
317 
318     /**
319      * Parses a full name and returns components as a list of tokens.
320      */
tokenize(String[] tokens, String fullName)321     public int tokenize(String[] tokens, String fullName) {
322         if (fullName == null) {
323             return 0;
324         }
325 
326         NameTokenizer tokenizer = new NameTokenizer(fullName);
327 
328         if (tokenizer.mStartPointer == tokenizer.mEndPointer) {
329             return 0;
330         }
331 
332         String firstToken = tokenizer.mTokens[tokenizer.mStartPointer];
333         int count = 0;
334         for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) {
335             tokens[count++] = tokenizer.mTokens[i];
336         }
337 
338         return count;
339     }
340 
341 
342     /**
343      * Parses a full name and returns parsed components in the Name object.
344      */
split(Name name, String fullName)345     public void split(Name name, String fullName) {
346         if (fullName == null) {
347             return;
348         }
349 
350         int fullNameStyle = guessFullNameStyle(fullName);
351         if (fullNameStyle == FullNameStyle.CJK) {
352             fullNameStyle = getAdjustedFullNameStyle(fullNameStyle);
353         }
354 
355         split(name, fullName, fullNameStyle);
356     }
357 
358     /**
359      * Parses a full name and returns parsed components in the Name object
360      * with a given fullNameStyle.
361      */
split(Name name, String fullName, int fullNameStyle)362     public void split(Name name, String fullName, int fullNameStyle) {
363         if (fullName == null) {
364             return;
365         }
366 
367         name.fullNameStyle = fullNameStyle;
368 
369         switch (fullNameStyle) {
370             case FullNameStyle.CHINESE:
371                 splitChineseName(name, fullName);
372                 break;
373 
374             case FullNameStyle.JAPANESE:
375                 splitJapaneseName(name, fullName);
376                 break;
377 
378             case FullNameStyle.KOREAN:
379                 splitKoreanName(name, fullName);
380                 break;
381 
382             default:
383                 splitWesternName(name, fullName);
384         }
385     }
386 
387     /**
388      * Splits a full name composed according to the Western tradition:
389      * <pre>
390      *   [prefix] given name(s) [[middle name] family name] [, suffix]
391      *   [prefix] family name, given name [middle name] [,suffix]
392      * </pre>
393      */
splitWesternName(Name name, String fullName)394     private void splitWesternName(Name name, String fullName) {
395         NameTokenizer tokens = new NameTokenizer(fullName);
396         parsePrefix(name, tokens);
397 
398         // If the name consists of just one or two tokens, treat them as first/last name,
399         // not as suffix.  Example: John Ma; Ma is last name, not "M.A.".
400         if (tokens.mEndPointer > 2) {
401             parseSuffix(name, tokens);
402         }
403 
404         if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) {
405             name.givenNames = tokens.mTokens[tokens.mStartPointer];
406         } else {
407             parseLastName(name, tokens);
408             parseMiddleName(name, tokens);
409             parseGivenNames(name, tokens);
410         }
411     }
412 
413     /**
414      * Splits a full name composed according to the Chinese tradition:
415      * <pre>
416      *   [family name [middle name]] given name
417      * </pre>
418      */
splitChineseName(Name name, String fullName)419     private void splitChineseName(Name name, String fullName) {
420         StringTokenizer tokenizer = new StringTokenizer(fullName);
421         while (tokenizer.hasMoreTokens()) {
422             String token = tokenizer.nextToken();
423             if (name.givenNames == null) {
424                 name.givenNames = token;
425             } else if (name.familyName == null) {
426                 name.familyName = name.givenNames;
427                 name.givenNames = token;
428             } else if (name.middleName == null) {
429                 name.middleName = name.givenNames;
430                 name.givenNames = token;
431             } else {
432                 name.middleName = name.middleName + name.givenNames;
433                 name.givenNames = token;
434             }
435         }
436 
437         // If a single word parse that word up.
438         if (name.givenNames != null && name.familyName == null && name.middleName == null) {
439             int length = fullName.length();
440             if (length == 2) {
441                 name.familyName = fullName.substring(0, 1);
442                 name.givenNames = fullName.substring(1);
443             } else if (length == 3) {
444                 name.familyName = fullName.substring(0, 1);
445                 name.middleName = fullName.substring(1, 2);
446                 name.givenNames = fullName.substring(2);
447             } else if (length == 4) {
448                 name.familyName = fullName.substring(0, 2);
449                 name.middleName = fullName.substring(2, 3);
450                 name.givenNames = fullName.substring(3);
451             }
452 
453         }
454     }
455 
456     /**
457      * Splits a full name composed according to the Japanese tradition:
458      * <pre>
459      *   [family name] given name(s)
460      * </pre>
461      */
splitJapaneseName(Name name, String fullName)462     private void splitJapaneseName(Name name, String fullName) {
463         StringTokenizer tokenizer = new StringTokenizer(fullName);
464         while (tokenizer.hasMoreTokens()) {
465             String token = tokenizer.nextToken();
466             if (name.givenNames == null) {
467                 name.givenNames = token;
468             } else if (name.familyName == null) {
469                 name.familyName = name.givenNames;
470                 name.givenNames = token;
471             } else {
472                 name.givenNames += " " + token;
473             }
474         }
475     }
476 
477     /**
478      * Splits a full name composed according to the Korean tradition:
479      * <pre>
480      *   [family name] given name(s)
481      * </pre>
482      */
splitKoreanName(Name name, String fullName)483     private void splitKoreanName(Name name, String fullName) {
484         StringTokenizer tokenizer = new StringTokenizer(fullName);
485         if (tokenizer.countTokens() > 1) {
486             // Each name can be identified by separators.
487             while (tokenizer.hasMoreTokens()) {
488                 String token = tokenizer.nextToken();
489                 if (name.givenNames == null) {
490                     name.givenNames = token;
491                 } else if (name.familyName == null) {
492                     name.familyName = name.givenNames;
493                     name.givenNames = token;
494                 } else {
495                     name.givenNames += " " + token;
496                 }
497             }
498         } else {
499             // There is no separator. Try to guess family name.
500             // The length of most family names is 1.
501             int familyNameLength = 1;
502 
503             // Compare with 2-length family names.
504             for (String twoLengthFamilyName : KOREAN_TWO_CHARCTER_FAMILY_NAMES) {
505                 if (fullName.startsWith(twoLengthFamilyName)) {
506                     familyNameLength = 2;
507                     break;
508                 }
509             }
510 
511             name.familyName = fullName.substring(0, familyNameLength);
512             if (fullName.length() > familyNameLength) {
513                 name.givenNames = fullName.substring(familyNameLength);
514             }
515         }
516     }
517 
518     /**
519      * Concatenates components of a name according to the rules dictated by the name style.
520      *
521      * @param givenNameFirst is ignored for CJK display name styles
522      */
join(Name name, boolean givenNameFirst, boolean includePrefix)523     public String join(Name name, boolean givenNameFirst, boolean includePrefix) {
524         String prefix = includePrefix ? name.prefix : null;
525         switch (name.fullNameStyle) {
526             case FullNameStyle.CJK:
527             case FullNameStyle.CHINESE:
528             case FullNameStyle.KOREAN:
529                 return join(prefix, name.familyName, name.middleName, name.givenNames,
530                         name.suffix, false, false, false);
531 
532             case FullNameStyle.JAPANESE:
533                 return join(prefix, name.familyName, name.middleName, name.givenNames,
534                         name.suffix, true, false, false);
535 
536             default:
537                 if (givenNameFirst) {
538                     return join(prefix, name.givenNames, name.middleName, name.familyName,
539                             name.suffix, true, false, true);
540                 } else {
541                     return join(prefix, name.familyName, name.givenNames, name.middleName,
542                             name.suffix, true, true, true);
543                 }
544         }
545     }
546 
547     /**
548      * Concatenates components of the phonetic name following the CJK tradition:
549      * family name + middle name + given name(s).
550      */
joinPhoneticName(Name name)551     public String joinPhoneticName(Name name) {
552         return join(null, name.phoneticFamilyName,
553                 name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false);
554     }
555 
556     /**
557      * Concatenates parts of a full name inserting spaces and commas as specified.
558      */
join(String prefix, String part1, String part2, String part3, String suffix, boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3)559     private String join(String prefix, String part1, String part2, String part3, String suffix,
560             boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) {
561         prefix = prefix == null ? null: prefix.trim();
562         part1 = part1 == null ? null: part1.trim();
563         part2 = part2 == null ? null: part2.trim();
564         part3 = part3 == null ? null: part3.trim();
565         suffix = suffix == null ? null: suffix.trim();
566 
567         boolean hasPrefix = !TextUtils.isEmpty(prefix);
568         boolean hasPart1 = !TextUtils.isEmpty(part1);
569         boolean hasPart2 = !TextUtils.isEmpty(part2);
570         boolean hasPart3 = !TextUtils.isEmpty(part3);
571         boolean hasSuffix = !TextUtils.isEmpty(suffix);
572 
573         boolean isSingleWord = true;
574         String singleWord = null;
575 
576         if (hasPrefix) {
577             singleWord = prefix;
578         }
579 
580         if (hasPart1) {
581             if (singleWord != null) {
582                 isSingleWord = false;
583             } else {
584                 singleWord = part1;
585             }
586         }
587 
588         if (hasPart2) {
589             if (singleWord != null) {
590                 isSingleWord = false;
591             } else {
592                 singleWord = part2;
593             }
594         }
595 
596         if (hasPart3) {
597             if (singleWord != null) {
598                 isSingleWord = false;
599             } else {
600                 singleWord = part3;
601             }
602         }
603 
604         if (hasSuffix) {
605             if (singleWord != null) {
606                 isSingleWord = false;
607             } else {
608                 singleWord = normalizedSuffix(suffix);
609             }
610         }
611 
612         if (isSingleWord) {
613             return singleWord;
614         }
615 
616         StringBuilder sb = new StringBuilder();
617 
618         if (hasPrefix) {
619             sb.append(prefix);
620         }
621 
622         if (hasPart1) {
623             if (hasPrefix) {
624                 sb.append(' ');
625             }
626             sb.append(part1);
627         }
628 
629         if (hasPart2) {
630             if (hasPrefix || hasPart1) {
631                 if (useCommaAfterPart1) {
632                     sb.append(',');
633                 }
634                 if (useSpace) {
635                     sb.append(' ');
636                 }
637             }
638             sb.append(part2);
639         }
640 
641         if (hasPart3) {
642             if (hasPrefix || hasPart1 || hasPart2) {
643                 if (useSpace) {
644                     sb.append(' ');
645                 }
646             }
647             sb.append(part3);
648         }
649 
650         if (hasSuffix) {
651             if (hasPrefix || hasPart1 || hasPart2 || hasPart3) {
652                 if (useCommaAfterPart3) {
653                     sb.append(',');
654                 }
655                 if (useSpace) {
656                     sb.append(' ');
657                 }
658             }
659             sb.append(normalizedSuffix(suffix));
660         }
661 
662         return sb.toString();
663     }
664 
665     /**
666      * Puts a dot after the supplied suffix if that is the accepted form of the suffix,
667      * e.g. "Jr." and "Sr.", but not "I", "II" and "III".
668      */
normalizedSuffix(String suffix)669     private String normalizedSuffix(String suffix) {
670         int length = suffix.length();
671         if (length == 0 || suffix.charAt(length - 1) == '.') {
672             return suffix;
673         }
674 
675         String withDot = suffix + '.';
676         if (mSuffixesSet.contains(withDot.toUpperCase())) {
677             return withDot;
678         } else {
679             return suffix;
680         }
681     }
682 
683     /**
684      * If the supplied name style is undefined, returns a default based on the language,
685      * otherwise returns the supplied name style itself.
686      *
687      * @param nameStyle See {@link FullNameStyle}.
688      */
getAdjustedFullNameStyle(int nameStyle)689     public int getAdjustedFullNameStyle(int nameStyle) {
690         if (nameStyle == FullNameStyle.UNDEFINED) {
691             if (JAPANESE_LANGUAGE.equals(mLanguage)) {
692                 return FullNameStyle.JAPANESE;
693             } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
694                 return FullNameStyle.KOREAN;
695             } else if (CHINESE_LANGUAGE.equals(mLanguage)) {
696                 return FullNameStyle.CHINESE;
697             } else {
698                 return FullNameStyle.WESTERN;
699             }
700         } else if (nameStyle == FullNameStyle.CJK) {
701             if (JAPANESE_LANGUAGE.equals(mLanguage)) {
702                 return FullNameStyle.JAPANESE;
703             } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
704                 return FullNameStyle.KOREAN;
705             } else {
706                 return FullNameStyle.CHINESE;
707             }
708         }
709         return nameStyle;
710     }
711 
712     /**
713      * Parses the first word from the name if it is a prefix.
714      */
parsePrefix(Name name, NameTokenizer tokens)715     private void parsePrefix(Name name, NameTokenizer tokens) {
716         if (tokens.mStartPointer == tokens.mEndPointer) {
717             return;
718         }
719 
720         String firstToken = tokens.mTokens[tokens.mStartPointer];
721         if (mPrefixesSet.contains(firstToken.toUpperCase())) {
722             if (tokens.hasDot(tokens.mStartPointer)) {
723                 firstToken += '.';
724             }
725             name.prefix = firstToken;
726             tokens.mStartPointer++;
727         }
728     }
729 
730     /**
731      * Parses the last word(s) from the name if it is a suffix.
732      */
parseSuffix(Name name, NameTokenizer tokens)733     private void parseSuffix(Name name, NameTokenizer tokens) {
734         if (tokens.mStartPointer == tokens.mEndPointer) {
735             return;
736         }
737 
738         String lastToken = tokens.mTokens[tokens.mEndPointer - 1];
739 
740         // Take care of an explicit comma-separated suffix
741         if (tokens.mEndPointer - tokens.mStartPointer > 2
742                 && tokens.hasComma(tokens.mEndPointer - 2)) {
743             if (tokens.hasDot(tokens.mEndPointer - 1)) {
744                 lastToken += '.';
745             }
746             name.suffix = lastToken;
747             tokens.mEndPointer--;
748             return;
749         }
750 
751         if (lastToken.length() > mMaxSuffixLength) {
752             return;
753         }
754 
755         String normalized = lastToken.toUpperCase();
756         if (mSuffixesSet.contains(normalized)) {
757             name.suffix = lastToken;
758             tokens.mEndPointer--;
759             return;
760         }
761 
762         if (tokens.hasDot(tokens.mEndPointer - 1)) {
763             lastToken += '.';
764         }
765         normalized += ".";
766 
767         // Take care of suffixes like M.D. and D.D.S.
768         int pos = tokens.mEndPointer - 1;
769         while (normalized.length() <= mMaxSuffixLength) {
770 
771             if (mSuffixesSet.contains(normalized)) {
772                 name.suffix = lastToken;
773                 tokens.mEndPointer = pos;
774                 return;
775             }
776 
777             if (pos == tokens.mStartPointer) {
778                 break;
779             }
780 
781             pos--;
782             if (tokens.hasDot(pos)) {
783                 lastToken = tokens.mTokens[pos] + "." + lastToken;
784             } else {
785                 lastToken = tokens.mTokens[pos] + " " + lastToken;
786             }
787 
788             normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized;
789         }
790     }
791 
parseLastName(Name name, NameTokenizer tokens)792     private void parseLastName(Name name, NameTokenizer tokens) {
793         if (tokens.mStartPointer == tokens.mEndPointer) {
794             return;
795         }
796 
797         // If the first word is followed by a comma, assume that it's the family name
798         if (tokens.hasComma(tokens.mStartPointer)) {
799            name.familyName = tokens.mTokens[tokens.mStartPointer];
800            tokens.mStartPointer++;
801            return;
802         }
803 
804         // If the second word is followed by a comma and the first word
805         // is a last name prefix as in "de Sade" and "von Cliburn", treat
806         // the first two words as the family name.
807         if (tokens.mStartPointer + 1 < tokens.mEndPointer
808                 && tokens.hasComma(tokens.mStartPointer + 1)
809                 && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) {
810             String familyNamePrefix = tokens.mTokens[tokens.mStartPointer];
811             if (tokens.hasDot(tokens.mStartPointer)) {
812                 familyNamePrefix += '.';
813             }
814             name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1];
815             tokens.mStartPointer += 2;
816             return;
817         }
818 
819         // Finally, assume that the last word is the last name
820         name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
821         tokens.mEndPointer--;
822 
823         // Take care of last names like "de Sade" and "von Cliburn"
824         if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
825             String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1];
826             if (isFamilyNamePrefix(lastNamePrefix)) {
827                 if (tokens.hasDot(tokens.mEndPointer - 1)) {
828                     lastNamePrefix += '.';
829                 }
830                 name.familyName = lastNamePrefix + " " + name.familyName;
831                 tokens.mEndPointer--;
832             }
833         }
834     }
835 
836     /**
837      * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de"
838      */
isFamilyNamePrefix(String word)839     private boolean isFamilyNamePrefix(String word) {
840         final String normalized = word.toUpperCase();
841 
842         return mLastNamePrefixesSet.contains(normalized)
843                 || mLastNamePrefixesSet.contains(normalized + ".");
844     }
845 
846 
parseMiddleName(Name name, NameTokenizer tokens)847     private void parseMiddleName(Name name, NameTokenizer tokens) {
848         if (tokens.mStartPointer == tokens.mEndPointer) {
849             return;
850         }
851 
852         if ((tokens.mEndPointer - tokens.mStartPointer) > 1) {
853             if ((tokens.mEndPointer - tokens.mStartPointer) == 2
854                     || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2].
855                             toUpperCase())) {
856                 name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
857                 if (tokens.hasDot(tokens.mEndPointer - 1)) {
858                     name.middleName += '.';
859                 }
860                 tokens.mEndPointer--;
861             }
862         }
863     }
864 
parseGivenNames(Name name, NameTokenizer tokens)865     private void parseGivenNames(Name name, NameTokenizer tokens) {
866         if (tokens.mStartPointer == tokens.mEndPointer) {
867             return;
868         }
869 
870         if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
871             name.givenNames = tokens.mTokens[tokens.mStartPointer];
872         } else {
873             StringBuilder sb = new StringBuilder();
874             for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
875                 if (i != tokens.mStartPointer) {
876                     sb.append(' ');
877                 }
878                 sb.append(tokens.mTokens[i]);
879                 if (tokens.hasDot(i)) {
880                     sb.append('.');
881                 }
882             }
883             name.givenNames = sb.toString();
884         }
885     }
886 
887     /**
888      * Makes the best guess at the expected full name style based on the character set
889      * used in the supplied name.  If the phonetic name is also supplied, tries to
890      * differentiate between Chinese, Japanese and Korean based on the alphabet used
891      * for the phonetic name.
892      */
guessNameStyle(Name name)893     public void guessNameStyle(Name name) {
894         guessFullNameStyle(name);
895         guessPhoneticNameStyle(name);
896         name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle,
897                 name.phoneticNameStyle);
898     }
899 
900     /**
901      * Updates the display name style according to the phonetic name style if we
902      * were unsure about display name style based on the name components, but
903      * phonetic name makes it more definitive.
904      */
getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle)905     public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) {
906         if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
907             if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) {
908                 if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) {
909                     return FullNameStyle.JAPANESE;
910                 } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) {
911                     return FullNameStyle.KOREAN;
912                 }
913                 if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) {
914                     return FullNameStyle.CHINESE;
915                 }
916             }
917         }
918         return nameStyle;
919     }
920 
921     /**
922      * Makes the best guess at the expected full name style based on the character set
923      * used in the supplied name.
924      */
guessFullNameStyle(NameSplitter.Name name)925     private void guessFullNameStyle(NameSplitter.Name name) {
926         if (name.fullNameStyle != FullNameStyle.UNDEFINED) {
927             return;
928         }
929 
930         int bestGuess = guessFullNameStyle(name.givenNames);
931         // A mix of Hanzi and latin chars are common in China, so we have to go through all names
932         // if the name is not JANPANESE or KOREAN.
933         if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK
934                 && bestGuess != FullNameStyle.WESTERN) {
935             name.fullNameStyle = bestGuess;
936             return;
937         }
938 
939         int guess = guessFullNameStyle(name.familyName);
940         if (guess != FullNameStyle.UNDEFINED) {
941             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
942                 name.fullNameStyle = guess;
943                 return;
944             }
945             bestGuess = guess;
946         }
947 
948         guess = guessFullNameStyle(name.middleName);
949         if (guess != FullNameStyle.UNDEFINED) {
950             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
951                 name.fullNameStyle = guess;
952                 return;
953             }
954             bestGuess = guess;
955         }
956 
957         guess = guessFullNameStyle(name.prefix);
958         if (guess != FullNameStyle.UNDEFINED) {
959             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
960                 name.fullNameStyle = guess;
961                 return;
962             }
963             bestGuess = guess;
964         }
965 
966         guess = guessFullNameStyle(name.suffix);
967         if (guess != FullNameStyle.UNDEFINED) {
968             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
969                 name.fullNameStyle = guess;
970                 return;
971             }
972             bestGuess = guess;
973         }
974 
975         name.fullNameStyle = bestGuess;
976     }
977 
guessFullNameStyle(String name)978     public int guessFullNameStyle(String name) {
979         if (name == null) {
980             return FullNameStyle.UNDEFINED;
981         }
982 
983         int nameStyle = FullNameStyle.UNDEFINED;
984         int length = name.length();
985         int offset = 0;
986         while (offset < length) {
987             int codePoint = Character.codePointAt(name, offset);
988             if (Character.isLetter(codePoint)) {
989                 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
990 
991                 if (!isLatinUnicodeBlock(unicodeBlock)) {
992 
993                     if (isCJKUnicodeBlock(unicodeBlock)) {
994                         // We don't know if this is Chinese, Japanese or Korean -
995                         // trying to figure out by looking at other characters in the name
996                         return guessCJKNameStyle(name, offset + Character.charCount(codePoint));
997                     }
998 
999                     if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
1000                         return FullNameStyle.JAPANESE;
1001                     }
1002 
1003                     if (isKoreanUnicodeBlock(unicodeBlock)) {
1004                         return FullNameStyle.KOREAN;
1005                     }
1006                 }
1007                 nameStyle = FullNameStyle.WESTERN;
1008             }
1009             offset += Character.charCount(codePoint);
1010         }
1011         return nameStyle;
1012     }
1013 
guessCJKNameStyle(String name, int offset)1014     private int guessCJKNameStyle(String name, int offset) {
1015         int length = name.length();
1016         while (offset < length) {
1017             int codePoint = Character.codePointAt(name, offset);
1018             if (Character.isLetter(codePoint)) {
1019                 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
1020                 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
1021                     return FullNameStyle.JAPANESE;
1022                 }
1023                 if (isKoreanUnicodeBlock(unicodeBlock)) {
1024                     return FullNameStyle.KOREAN;
1025                 }
1026             }
1027             offset += Character.charCount(codePoint);
1028         }
1029 
1030         return FullNameStyle.CJK;
1031     }
1032 
guessPhoneticNameStyle(NameSplitter.Name name)1033     private void guessPhoneticNameStyle(NameSplitter.Name name) {
1034         if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
1035             return;
1036         }
1037 
1038         int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName);
1039         if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) {
1040             name.phoneticNameStyle = bestGuess;
1041             return;
1042         }
1043 
1044         int guess = guessPhoneticNameStyle(name.phoneticGivenName);
1045         if (guess != FullNameStyle.UNDEFINED) {
1046             if (guess != FullNameStyle.CJK) {
1047                 name.phoneticNameStyle = guess;
1048                 return;
1049             }
1050             bestGuess = guess;
1051         }
1052 
1053         guess = guessPhoneticNameStyle(name.phoneticMiddleName);
1054         if (guess != FullNameStyle.UNDEFINED) {
1055             if (guess != FullNameStyle.CJK) {
1056                 name.phoneticNameStyle = guess;
1057                 return;
1058             }
1059             bestGuess = guess;
1060         }
1061     }
1062 
guessPhoneticNameStyle(String name)1063     public int guessPhoneticNameStyle(String name) {
1064         if (name == null) {
1065             return PhoneticNameStyle.UNDEFINED;
1066         }
1067 
1068         int nameStyle = PhoneticNameStyle.UNDEFINED;
1069         int length = name.length();
1070         int offset = 0;
1071         while (offset < length) {
1072             int codePoint = Character.codePointAt(name, offset);
1073             if (Character.isLetter(codePoint)) {
1074                 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
1075                 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
1076                     return PhoneticNameStyle.JAPANESE;
1077                 }
1078                 if (isKoreanUnicodeBlock(unicodeBlock)) {
1079                     return PhoneticNameStyle.KOREAN;
1080                 }
1081                 if (isLatinUnicodeBlock(unicodeBlock)) {
1082                     return PhoneticNameStyle.PINYIN;
1083                 }
1084             }
1085             offset += Character.charCount(codePoint);
1086         }
1087 
1088         return nameStyle;
1089     }
1090 
isLatinUnicodeBlock(UnicodeBlock unicodeBlock)1091     private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) {
1092         return unicodeBlock == UnicodeBlock.BASIC_LATIN ||
1093                 unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT ||
1094                 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A ||
1095                 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B ||
1096                 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
1097     }
1098 
isCJKUnicodeBlock(UnicodeBlock block)1099     private static boolean isCJKUnicodeBlock(UnicodeBlock block) {
1100         return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
1101                 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
1102                 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
1103                 || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
1104                 || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT
1105                 || block == UnicodeBlock.CJK_COMPATIBILITY
1106                 || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS
1107                 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
1108                 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
1109     }
1110 
isKoreanUnicodeBlock(UnicodeBlock unicodeBlock)1111     private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) {
1112         return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES ||
1113                 unicodeBlock == UnicodeBlock.HANGUL_JAMO ||
1114                 unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
1115     }
1116 
isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock)1117     private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) {
1118         return unicodeBlock == UnicodeBlock.KATAKANA ||
1119                 unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS ||
1120                 unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS ||
1121                 unicodeBlock == UnicodeBlock.HIRAGANA;
1122     }
1123 }
1124