/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License */ package com.android.providers.contacts; import android.content.ContentValues; import android.provider.ContactsContract.CommonDataKinds.StructuredName; import android.provider.ContactsContract.FullNameStyle; import android.provider.ContactsContract.PhoneticNameStyle; import android.text.TextUtils; import com.android.providers.contacts.util.NeededForTesting; import java.lang.Character.UnicodeBlock; import java.util.HashSet; import java.util.Locale; import java.util.StringTokenizer; /** * The purpose of this class is to split a full name into given names and last * name. The logic only supports having a single last name. If the full name has * multiple last names the output will be incorrect. *

 * <p>
 * Core algorithm:
 * <ol>
 * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
 * <li>Assign the last remaining token as the last name.</li>
 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
 * this word also as the last name.</li>
 * <li>Assign the rest of the words as the "given names".</li>
 * </ol>
*/ public class NameSplitter { public static final int MAX_TOKENS = 10; private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase(); private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase(); // This includes simplified and traditional Chinese private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase(); private final HashSet mPrefixesSet; private final HashSet mSuffixesSet; private final int mMaxSuffixLength; private final HashSet mLastNamePrefixesSet; private final HashSet mConjuctions; private final Locale mLocale; private final String mLanguage; /** * Two-Chracter long Korean family names. * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1 */ private static final String[] KOREAN_TWO_CHARCTER_FAMILY_NAMES = { "\uAC15\uC804", // Gang Jeon "\uB0A8\uAD81", // Nam Goong "\uB3C5\uACE0", // Dok Go "\uB3D9\uBC29", // Dong Bang "\uB9DD\uC808", // Mang Jeol "\uC0AC\uACF5", // Sa Gong "\uC11C\uBB38", // Seo Moon "\uC120\uC6B0", // Seon Woo "\uC18C\uBD09", // So Bong "\uC5B4\uAE08", // Uh Geum "\uC7A5\uACE1", // Jang Gok "\uC81C\uAC08", // Je Gal "\uD669\uBCF4" // Hwang Bo }; public static class Name { public String prefix; public String givenNames; public String middleName; public String familyName; public String suffix; public int fullNameStyle; public String phoneticFamilyName; public String phoneticMiddleName; public String phoneticGivenName; public int phoneticNameStyle; public Name() { } public Name(String prefix, String givenNames, String middleName, String familyName, String suffix) { this.prefix = prefix; this.givenNames = givenNames; this.middleName = middleName; this.familyName = familyName; this.suffix = suffix; } @NeededForTesting public String getPrefix() { return prefix; } public String getGivenNames() { return givenNames; } public String getMiddleName() { return middleName; } public String getFamilyName() { return familyName; } 
@NeededForTesting public String getSuffix() { return suffix; } public int getFullNameStyle() { return fullNameStyle; } public String getPhoneticFamilyName() { return phoneticFamilyName; } public String getPhoneticMiddleName() { return phoneticMiddleName; } public String getPhoneticGivenName() { return phoneticGivenName; } public int getPhoneticNameStyle() { return phoneticNameStyle; } public void fromValues(ContentValues values) { prefix = values.getAsString(StructuredName.PREFIX); givenNames = values.getAsString(StructuredName.GIVEN_NAME); middleName = values.getAsString(StructuredName.MIDDLE_NAME); familyName = values.getAsString(StructuredName.FAMILY_NAME); suffix = values.getAsString(StructuredName.SUFFIX); Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE); fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer; phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME); phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME); phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME); integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE); phoneticNameStyle = integer == null ? 
PhoneticNameStyle.UNDEFINED : integer; } public void toValues(ContentValues values) { putValueIfPresent(values, StructuredName.PREFIX, prefix); putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames); putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName); putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName); putValueIfPresent(values, StructuredName.SUFFIX, suffix); values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle); putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName); putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName); putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName); values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle); } private void putValueIfPresent(ContentValues values, String name, String value) { if (value != null) { values.put(name, value); } } public void clear() { prefix = null; givenNames = null; middleName = null; familyName = null; suffix = null; fullNameStyle = FullNameStyle.UNDEFINED; phoneticFamilyName = null; phoneticMiddleName = null; phoneticGivenName = null; phoneticNameStyle = PhoneticNameStyle.UNDEFINED; } public boolean isEmpty() { return TextUtils.isEmpty(givenNames) && TextUtils.isEmpty(middleName) && TextUtils.isEmpty(familyName) && TextUtils.isEmpty(suffix) && TextUtils.isEmpty(phoneticFamilyName) && TextUtils.isEmpty(phoneticMiddleName) && TextUtils.isEmpty(phoneticGivenName); } @Override public String toString() { return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName + " family: " + familyName + " suffix: " + suffix + " ph/given: " + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: " + phoneticFamilyName + "]"; } } private static class NameTokenizer extends StringTokenizer { private final String[] mTokens; private int mDotBitmask; private int mCommaBitmask; private int mStartPointer; private int mEndPointer; public NameTokenizer(String 
fullName) { super(fullName, " .,", true); mTokens = new String[MAX_TOKENS]; // Iterate over tokens, skipping over empty ones and marking tokens that // are followed by dots. while (hasMoreTokens() && mEndPointer < MAX_TOKENS) { final String token = nextToken(); if (token.length() > 0) { final char c = token.charAt(0); if (c == ' ') { continue; } } if (mEndPointer > 0 && token.charAt(0) == '.') { mDotBitmask |= (1 << (mEndPointer - 1)); } else if (mEndPointer > 0 && token.charAt(0) == ',') { mCommaBitmask |= (1 << (mEndPointer - 1)); } else { mTokens[mEndPointer] = token; mEndPointer++; } } } /** * Returns true if the token is followed by a dot in the original full name. */ public boolean hasDot(int index) { return (mDotBitmask & (1 << index)) != 0; } /** * Returns true if the token is followed by a comma in the original full name. */ public boolean hasComma(int index) { return (mCommaBitmask & (1 << index)) != 0; } } /** * Constructor. * * @param commonPrefixes comma-separated list of common prefixes, * e.g. "Mr, Ms, Mrs" * @param commonLastNamePrefixes comma-separated list of common last name prefixes, * e.g. "d', st, st., von" * @param commonSuffixes comma-separated list of common suffixes, * e.g. "Jr, M.D., MD, D.D.S." * @param commonConjunctions comma-separated list of common conjuctions, * e.g. "AND, Or" */ public NameSplitter(String commonPrefixes, String commonLastNamePrefixes, String commonSuffixes, String commonConjunctions, Locale locale) { // TODO: refactor this to use resources mPrefixesSet = convertToSet(commonPrefixes); mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes); mSuffixesSet = convertToSet(commonSuffixes); mConjuctions = convertToSet(commonConjunctions); mLocale = locale != null ? 
locale : Locale.getDefault(); mLanguage = mLocale.getLanguage().toLowerCase(); int maxLength = 0; for (String suffix : mSuffixesSet) { if (suffix.length() > maxLength) { maxLength = suffix.length(); } } mMaxSuffixLength = maxLength; } /** * Converts a comma-separated list of Strings to a set of Strings. Trims strings * and converts them to upper case. */ private static HashSet convertToSet(String strings) { HashSet set = new HashSet(); if (strings != null) { String[] split = strings.split(","); for (int i = 0; i < split.length; i++) { set.add(split[i].trim().toUpperCase()); } } return set; } /** * Parses a full name and returns components as a list of tokens. */ public int tokenize(String[] tokens, String fullName) { if (fullName == null) { return 0; } NameTokenizer tokenizer = new NameTokenizer(fullName); if (tokenizer.mStartPointer == tokenizer.mEndPointer) { return 0; } String firstToken = tokenizer.mTokens[tokenizer.mStartPointer]; int count = 0; for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) { tokens[count++] = tokenizer.mTokens[i]; } return count; } /** * Parses a full name and returns parsed components in the Name object. */ public void split(Name name, String fullName) { if (fullName == null) { return; } int fullNameStyle = guessFullNameStyle(fullName); if (fullNameStyle == FullNameStyle.CJK) { fullNameStyle = getAdjustedFullNameStyle(fullNameStyle); } split(name, fullName, fullNameStyle); } /** * Parses a full name and returns parsed components in the Name object * with a given fullNameStyle. 
*/ public void split(Name name, String fullName, int fullNameStyle) { if (fullName == null) { return; } name.fullNameStyle = fullNameStyle; switch (fullNameStyle) { case FullNameStyle.CHINESE: splitChineseName(name, fullName); break; case FullNameStyle.JAPANESE: splitJapaneseName(name, fullName); break; case FullNameStyle.KOREAN: splitKoreanName(name, fullName); break; default: splitWesternName(name, fullName); } } /** * Splits a full name composed according to the Western tradition: *
     *   [prefix] given name(s) [[middle name] family name] [, suffix]
     *   [prefix] family name, given name [middle name] [,suffix]
     * 
*/ private void splitWesternName(Name name, String fullName) { NameTokenizer tokens = new NameTokenizer(fullName); parsePrefix(name, tokens); // If the name consists of just one or two tokens, treat them as first/last name, // not as suffix. Example: John Ma; Ma is last name, not "M.A.". if (tokens.mEndPointer > 2) { parseSuffix(name, tokens); } if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) { name.givenNames = tokens.mTokens[tokens.mStartPointer]; } else { parseLastName(name, tokens); parseMiddleName(name, tokens); parseGivenNames(name, tokens); } } /** * Splits a full name composed according to the Chinese tradition: *
     *   [family name [middle name]] given name
     * 
*/ private void splitChineseName(Name name, String fullName) { StringTokenizer tokenizer = new StringTokenizer(fullName); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (name.givenNames == null) { name.givenNames = token; } else if (name.familyName == null) { name.familyName = name.givenNames; name.givenNames = token; } else if (name.middleName == null) { name.middleName = name.givenNames; name.givenNames = token; } else { name.middleName = name.middleName + name.givenNames; name.givenNames = token; } } // If a single word parse that word up. if (name.givenNames != null && name.familyName == null && name.middleName == null) { int length = fullName.length(); if (length == 2) { name.familyName = fullName.substring(0, 1); name.givenNames = fullName.substring(1); } else if (length == 3) { name.familyName = fullName.substring(0, 1); name.middleName = fullName.substring(1, 2); name.givenNames = fullName.substring(2); } else if (length == 4) { name.familyName = fullName.substring(0, 2); name.middleName = fullName.substring(2, 3); name.givenNames = fullName.substring(3); } } } /** * Splits a full name composed according to the Japanese tradition: *
     *   [family name] given name(s)
     * 
*/ private void splitJapaneseName(Name name, String fullName) { StringTokenizer tokenizer = new StringTokenizer(fullName); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (name.givenNames == null) { name.givenNames = token; } else if (name.familyName == null) { name.familyName = name.givenNames; name.givenNames = token; } else { name.givenNames += " " + token; } } } /** * Splits a full name composed according to the Korean tradition: *
     *   [family name] given name(s)
     * 
*/ private void splitKoreanName(Name name, String fullName) { StringTokenizer tokenizer = new StringTokenizer(fullName); if (tokenizer.countTokens() > 1) { // Each name can be identified by separators. while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (name.givenNames == null) { name.givenNames = token; } else if (name.familyName == null) { name.familyName = name.givenNames; name.givenNames = token; } else { name.givenNames += " " + token; } } } else { // There is no separator. Try to guess family name. // The length of most family names is 1. int familyNameLength = 1; // Compare with 2-length family names. for (String twoLengthFamilyName : KOREAN_TWO_CHARCTER_FAMILY_NAMES) { if (fullName.startsWith(twoLengthFamilyName)) { familyNameLength = 2; break; } } name.familyName = fullName.substring(0, familyNameLength); if (fullName.length() > familyNameLength) { name.givenNames = fullName.substring(familyNameLength); } } } /** * Concatenates components of a name according to the rules dictated by the name style. * * @param givenNameFirst is ignored for CJK display name styles */ public String join(Name name, boolean givenNameFirst, boolean includePrefix) { String prefix = includePrefix ? name.prefix : null; switch (name.fullNameStyle) { case FullNameStyle.CJK: case FullNameStyle.CHINESE: case FullNameStyle.KOREAN: return join(prefix, name.familyName, name.middleName, name.givenNames, name.suffix, false, false, false); case FullNameStyle.JAPANESE: return join(prefix, name.familyName, name.middleName, name.givenNames, name.suffix, true, false, false); default: if (givenNameFirst) { return join(prefix, name.givenNames, name.middleName, name.familyName, name.suffix, true, false, true); } else { return join(prefix, name.familyName, name.givenNames, name.middleName, name.suffix, true, true, true); } } } /** * Concatenates components of the phonetic name following the CJK tradition: * family name + middle name + given name(s). 
*/ public String joinPhoneticName(Name name) { return join(null, name.phoneticFamilyName, name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false); } /** * Concatenates parts of a full name inserting spaces and commas as specified. */ private String join(String prefix, String part1, String part2, String part3, String suffix, boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) { prefix = prefix == null ? null: prefix.trim(); part1 = part1 == null ? null: part1.trim(); part2 = part2 == null ? null: part2.trim(); part3 = part3 == null ? null: part3.trim(); suffix = suffix == null ? null: suffix.trim(); boolean hasPrefix = !TextUtils.isEmpty(prefix); boolean hasPart1 = !TextUtils.isEmpty(part1); boolean hasPart2 = !TextUtils.isEmpty(part2); boolean hasPart3 = !TextUtils.isEmpty(part3); boolean hasSuffix = !TextUtils.isEmpty(suffix); boolean isSingleWord = true; String singleWord = null; if (hasPrefix) { singleWord = prefix; } if (hasPart1) { if (singleWord != null) { isSingleWord = false; } else { singleWord = part1; } } if (hasPart2) { if (singleWord != null) { isSingleWord = false; } else { singleWord = part2; } } if (hasPart3) { if (singleWord != null) { isSingleWord = false; } else { singleWord = part3; } } if (hasSuffix) { if (singleWord != null) { isSingleWord = false; } else { singleWord = normalizedSuffix(suffix); } } if (isSingleWord) { return singleWord; } StringBuilder sb = new StringBuilder(); if (hasPrefix) { sb.append(prefix); } if (hasPart1) { if (hasPrefix) { sb.append(' '); } sb.append(part1); } if (hasPart2) { if (hasPrefix || hasPart1) { if (useCommaAfterPart1) { sb.append(','); } if (useSpace) { sb.append(' '); } } sb.append(part2); } if (hasPart3) { if (hasPrefix || hasPart1 || hasPart2) { if (useSpace) { sb.append(' '); } } sb.append(part3); } if (hasSuffix) { if (hasPrefix || hasPart1 || hasPart2 || hasPart3) { if (useCommaAfterPart3) { sb.append(','); } if (useSpace) { sb.append(' '); } } 
sb.append(normalizedSuffix(suffix)); } return sb.toString(); } /** * Puts a dot after the supplied suffix if that is the accepted form of the suffix, * e.g. "Jr." and "Sr.", but not "I", "II" and "III". */ private String normalizedSuffix(String suffix) { int length = suffix.length(); if (length == 0 || suffix.charAt(length - 1) == '.') { return suffix; } String withDot = suffix + '.'; if (mSuffixesSet.contains(withDot.toUpperCase())) { return withDot; } else { return suffix; } } /** * If the supplied name style is undefined, returns a default based on the language, * otherwise returns the supplied name style itself. * * @param nameStyle See {@link FullNameStyle}. */ public int getAdjustedFullNameStyle(int nameStyle) { if (nameStyle == FullNameStyle.UNDEFINED) { if (JAPANESE_LANGUAGE.equals(mLanguage)) { return FullNameStyle.JAPANESE; } else if (KOREAN_LANGUAGE.equals(mLanguage)) { return FullNameStyle.KOREAN; } else if (CHINESE_LANGUAGE.equals(mLanguage)) { return FullNameStyle.CHINESE; } else { return FullNameStyle.WESTERN; } } else if (nameStyle == FullNameStyle.CJK) { if (JAPANESE_LANGUAGE.equals(mLanguage)) { return FullNameStyle.JAPANESE; } else if (KOREAN_LANGUAGE.equals(mLanguage)) { return FullNameStyle.KOREAN; } else { return FullNameStyle.CHINESE; } } return nameStyle; } /** * Parses the first word from the name if it is a prefix. */ private void parsePrefix(Name name, NameTokenizer tokens) { if (tokens.mStartPointer == tokens.mEndPointer) { return; } String firstToken = tokens.mTokens[tokens.mStartPointer]; if (mPrefixesSet.contains(firstToken.toUpperCase())) { if (tokens.hasDot(tokens.mStartPointer)) { firstToken += '.'; } name.prefix = firstToken; tokens.mStartPointer++; } } /** * Parses the last word(s) from the name if it is a suffix. 
*/ private void parseSuffix(Name name, NameTokenizer tokens) { if (tokens.mStartPointer == tokens.mEndPointer) { return; } String lastToken = tokens.mTokens[tokens.mEndPointer - 1]; // Take care of an explicit comma-separated suffix if (tokens.mEndPointer - tokens.mStartPointer > 2 && tokens.hasComma(tokens.mEndPointer - 2)) { if (tokens.hasDot(tokens.mEndPointer - 1)) { lastToken += '.'; } name.suffix = lastToken; tokens.mEndPointer--; return; } if (lastToken.length() > mMaxSuffixLength) { return; } String normalized = lastToken.toUpperCase(); if (mSuffixesSet.contains(normalized)) { name.suffix = lastToken; tokens.mEndPointer--; return; } if (tokens.hasDot(tokens.mEndPointer - 1)) { lastToken += '.'; } normalized += "."; // Take care of suffixes like M.D. and D.D.S. int pos = tokens.mEndPointer - 1; while (normalized.length() <= mMaxSuffixLength) { if (mSuffixesSet.contains(normalized)) { name.suffix = lastToken; tokens.mEndPointer = pos; return; } if (pos == tokens.mStartPointer) { break; } pos--; if (tokens.hasDot(pos)) { lastToken = tokens.mTokens[pos] + "." + lastToken; } else { lastToken = tokens.mTokens[pos] + " " + lastToken; } normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized; } } private void parseLastName(Name name, NameTokenizer tokens) { if (tokens.mStartPointer == tokens.mEndPointer) { return; } // If the first word is followed by a comma, assume that it's the family name if (tokens.hasComma(tokens.mStartPointer)) { name.familyName = tokens.mTokens[tokens.mStartPointer]; tokens.mStartPointer++; return; } // If the second word is followed by a comma and the first word // is a last name prefix as in "de Sade" and "von Cliburn", treat // the first two words as the family name. 
if (tokens.mStartPointer + 1 < tokens.mEndPointer && tokens.hasComma(tokens.mStartPointer + 1) && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) { String familyNamePrefix = tokens.mTokens[tokens.mStartPointer]; if (tokens.hasDot(tokens.mStartPointer)) { familyNamePrefix += '.'; } name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1]; tokens.mStartPointer += 2; return; } // Finally, assume that the last word is the last name name.familyName = tokens.mTokens[tokens.mEndPointer - 1]; tokens.mEndPointer--; // Take care of last names like "de Sade" and "von Cliburn" if ((tokens.mEndPointer - tokens.mStartPointer) > 0) { String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1]; if (isFamilyNamePrefix(lastNamePrefix)) { if (tokens.hasDot(tokens.mEndPointer - 1)) { lastNamePrefix += '.'; } name.familyName = lastNamePrefix + " " + name.familyName; tokens.mEndPointer--; } } } /** * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de" */ private boolean isFamilyNamePrefix(String word) { final String normalized = word.toUpperCase(); return mLastNamePrefixesSet.contains(normalized) || mLastNamePrefixesSet.contains(normalized + "."); } private void parseMiddleName(Name name, NameTokenizer tokens) { if (tokens.mStartPointer == tokens.mEndPointer) { return; } if ((tokens.mEndPointer - tokens.mStartPointer) > 1) { if ((tokens.mEndPointer - tokens.mStartPointer) == 2 || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2]. 
toUpperCase())) { name.middleName = tokens.mTokens[tokens.mEndPointer - 1]; if (tokens.hasDot(tokens.mEndPointer - 1)) { name.middleName += '.'; } tokens.mEndPointer--; } } } private void parseGivenNames(Name name, NameTokenizer tokens) { if (tokens.mStartPointer == tokens.mEndPointer) { return; } if ((tokens.mEndPointer - tokens.mStartPointer) == 1) { name.givenNames = tokens.mTokens[tokens.mStartPointer]; } else { StringBuilder sb = new StringBuilder(); for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) { if (i != tokens.mStartPointer) { sb.append(' '); } sb.append(tokens.mTokens[i]); if (tokens.hasDot(i)) { sb.append('.'); } } name.givenNames = sb.toString(); } } /** * Makes the best guess at the expected full name style based on the character set * used in the supplied name. If the phonetic name is also supplied, tries to * differentiate between Chinese, Japanese and Korean based on the alphabet used * for the phonetic name. */ public void guessNameStyle(Name name) { guessFullNameStyle(name); guessPhoneticNameStyle(name); name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle, name.phoneticNameStyle); } /** * Updates the display name style according to the phonetic name style if we * were unsure about display name style based on the name components, but * phonetic name makes it more definitive. 
*/ public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) { if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) { if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) { if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) { return FullNameStyle.JAPANESE; } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) { return FullNameStyle.KOREAN; } if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) { return FullNameStyle.CHINESE; } } } return nameStyle; } /** * Makes the best guess at the expected full name style based on the character set * used in the supplied name. */ private void guessFullNameStyle(NameSplitter.Name name) { if (name.fullNameStyle != FullNameStyle.UNDEFINED) { return; } int bestGuess = guessFullNameStyle(name.givenNames); // A mix of Hanzi and latin chars are common in China, so we have to go through all names // if the name is not JANPANESE or KOREAN. if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK && bestGuess != FullNameStyle.WESTERN) { name.fullNameStyle = bestGuess; return; } int guess = guessFullNameStyle(name.familyName); if (guess != FullNameStyle.UNDEFINED) { if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { name.fullNameStyle = guess; return; } bestGuess = guess; } guess = guessFullNameStyle(name.middleName); if (guess != FullNameStyle.UNDEFINED) { if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { name.fullNameStyle = guess; return; } bestGuess = guess; } guess = guessFullNameStyle(name.prefix); if (guess != FullNameStyle.UNDEFINED) { if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { name.fullNameStyle = guess; return; } bestGuess = guess; } guess = guessFullNameStyle(name.suffix); if (guess != FullNameStyle.UNDEFINED) { if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { name.fullNameStyle = guess; return; } bestGuess = guess; } name.fullNameStyle 
= bestGuess; } public int guessFullNameStyle(String name) { if (name == null) { return FullNameStyle.UNDEFINED; } int nameStyle = FullNameStyle.UNDEFINED; int length = name.length(); int offset = 0; while (offset < length) { int codePoint = Character.codePointAt(name, offset); if (Character.isLetter(codePoint)) { UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); if (!isLatinUnicodeBlock(unicodeBlock)) { if (isCJKUnicodeBlock(unicodeBlock)) { // We don't know if this is Chinese, Japanese or Korean - // trying to figure out by looking at other characters in the name return guessCJKNameStyle(name, offset + Character.charCount(codePoint)); } if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { return FullNameStyle.JAPANESE; } if (isKoreanUnicodeBlock(unicodeBlock)) { return FullNameStyle.KOREAN; } } nameStyle = FullNameStyle.WESTERN; } offset += Character.charCount(codePoint); } return nameStyle; } private int guessCJKNameStyle(String name, int offset) { int length = name.length(); while (offset < length) { int codePoint = Character.codePointAt(name, offset); if (Character.isLetter(codePoint)) { UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { return FullNameStyle.JAPANESE; } if (isKoreanUnicodeBlock(unicodeBlock)) { return FullNameStyle.KOREAN; } } offset += Character.charCount(codePoint); } return FullNameStyle.CJK; } private void guessPhoneticNameStyle(NameSplitter.Name name) { if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) { return; } int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName); if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) { name.phoneticNameStyle = bestGuess; return; } int guess = guessPhoneticNameStyle(name.phoneticGivenName); if (guess != FullNameStyle.UNDEFINED) { if (guess != FullNameStyle.CJK) { name.phoneticNameStyle = guess; return; } bestGuess = guess; } guess = guessPhoneticNameStyle(name.phoneticMiddleName); if (guess != 
FullNameStyle.UNDEFINED) { if (guess != FullNameStyle.CJK) { name.phoneticNameStyle = guess; return; } bestGuess = guess; } } public int guessPhoneticNameStyle(String name) { if (name == null) { return PhoneticNameStyle.UNDEFINED; } int nameStyle = PhoneticNameStyle.UNDEFINED; int length = name.length(); int offset = 0; while (offset < length) { int codePoint = Character.codePointAt(name, offset); if (Character.isLetter(codePoint)) { UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { return PhoneticNameStyle.JAPANESE; } if (isKoreanUnicodeBlock(unicodeBlock)) { return PhoneticNameStyle.KOREAN; } if (isLatinUnicodeBlock(unicodeBlock)) { return PhoneticNameStyle.PINYIN; } } offset += Character.charCount(codePoint); } return nameStyle; } private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) { return unicodeBlock == UnicodeBlock.BASIC_LATIN || unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT || unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A || unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B || unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL; } private static boolean isCJKUnicodeBlock(UnicodeBlock block) { return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT || block == UnicodeBlock.CJK_COMPATIBILITY || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT; } private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) { return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES || unicodeBlock == UnicodeBlock.HANGUL_JAMO || unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO; } private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) { 
return unicodeBlock == UnicodeBlock.KATAKANA || unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS || unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS || unicodeBlock == UnicodeBlock.HIRAGANA; } }