/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */
package com.android.providers.contacts;

import android.content.ContentValues;
import android.provider.ContactsContract.CommonDataKinds.StructuredName;
import android.provider.ContactsContract.FullNameStyle;
import android.provider.ContactsContract.PhoneticNameStyle;
import android.text.TextUtils;

import com.android.providers.contacts.util.NeededForTesting;

import java.lang.Character.UnicodeBlock;
import java.util.HashSet;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * The purpose of this class is to split a full name into given names and last
 * name. The logic only supports having a single last name. If the full name has
 * multiple last names the output will be incorrect.
 * <p>
 * Core algorithm:
 * <ol>
 * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
 * <li>Assign the last remaining token as the last name.</li>
 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
 * this word also as the last name.</li>
 * <li>Assign the rest of the words as the "given names".</li>
 * </ol>
 */
public class NameSplitter {

    /** Maximum number of tokens a Western-style full name is broken into. */
    public static final int MAX_TOKENS = 10;

    // Language codes are compared case-insensitively; Locale.ROOT keeps the
    // conversion independent of the device locale.
    private static final String JAPANESE_LANGUAGE =
            Locale.JAPANESE.getLanguage().toLowerCase(Locale.ROOT);
    private static final String KOREAN_LANGUAGE =
            Locale.KOREAN.getLanguage().toLowerCase(Locale.ROOT);

    // This includes simplified and traditional Chinese
    private static final String CHINESE_LANGUAGE =
            Locale.CHINESE.getLanguage().toLowerCase(Locale.ROOT);

    private final HashSet<String> mPrefixesSet;
    private final HashSet<String> mSuffixesSet;
    private final int mMaxSuffixLength;
    private final HashSet<String> mLastNamePrefixesSet;
    private final HashSet<String> mConjunctions;
    private final Locale mLocale;
    private final String mLanguage;

    /**
     * Two-character long Korean family names.
     * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1
     */
    private static final String[] KOREAN_TWO_CHARACTER_FAMILY_NAMES = {
        "\uAC15\uC804", // Gang Jeon
        "\uB0A8\uAD81", // Nam Goong
        "\uB3C5\uACE0", // Dok Go
        "\uB3D9\uBC29", // Dong Bang
        "\uB9DD\uC808", // Mang Jeol
        "\uC0AC\uACF5", // Sa Gong
        "\uC11C\uBB38", // Seo Moon
        "\uC120\uC6B0", // Seon Woo
        "\uC18C\uBD09", // So Bong
        "\uC5B4\uAE08", // Uh Geum
        "\uC7A5\uACE1", // Jang Gok
        "\uC81C\uAC08", // Je Gal
        "\uD669\uBCF4"  // Hwang Bo
    };

    /**
     * Mutable holder for the components of a structured name, its phonetic
     * counterpart and the detected styles of both.
     */
    public static class Name {
        public String prefix;
        public String givenNames;
        public String middleName;
        public String familyName;
        public String suffix;

        public int fullNameStyle;

        public String phoneticFamilyName;
        public String phoneticMiddleName;
        public String phoneticGivenName;

        public int phoneticNameStyle;

        public Name() {
        }

        public Name(String prefix, String givenNames, String middleName, String familyName,
                String suffix) {
            this.prefix = prefix;
            this.givenNames = givenNames;
            this.middleName = middleName;
            this.familyName = familyName;
            this.suffix = suffix;
        }

        @NeededForTesting
        public String getPrefix() {
            return prefix;
        }

        public String getGivenNames() {
            return givenNames;
        }

        public String getMiddleName() {
            return middleName;
        }

        public String getFamilyName() {
            return familyName;
        }

        @NeededForTesting
        public String getSuffix() {
            return suffix;
        }

        public int getFullNameStyle() {
            return fullNameStyle;
        }

        public String getPhoneticFamilyName() {
            return phoneticFamilyName;
        }

        public String getPhoneticMiddleName() {
            return phoneticMiddleName;
        }

        public String getPhoneticGivenName() {
            return phoneticGivenName;
        }

        public int getPhoneticNameStyle() {
            return phoneticNameStyle;
        }

        /**
         * Overwrites every field of this object with the corresponding
         * {@link StructuredName} column of {@code values}. Missing string columns
         * become {@code null}; missing style columns become {@code UNDEFINED}.
         */
        public void fromValues(ContentValues values) {
            prefix = values.getAsString(StructuredName.PREFIX);
            givenNames = values.getAsString(StructuredName.GIVEN_NAME);
            middleName = values.getAsString(StructuredName.MIDDLE_NAME);
            familyName = values.getAsString(StructuredName.FAMILY_NAME);
            suffix = values.getAsString(StructuredName.SUFFIX);

            Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE);
            fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer;

            phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME);
            phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME);
            phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME);

            integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE);
            phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer;
        }

        /**
         * Copies the fields of this object into {@code values}. String fields that
         * are {@code null} are not written; both style fields are always written.
         */
        public void toValues(ContentValues values) {
            putValueIfPresent(values, StructuredName.PREFIX, prefix);
            putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames);
            putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName);
            putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName);
            putValueIfPresent(values, StructuredName.SUFFIX, suffix);
            values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle);
            putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName);
            putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName);
            putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName);
            values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle);
        }

        private void putValueIfPresent(ContentValues values, String name, String value) {
            if (value != null) {
                values.put(name, value);
            }
        }

        /** Resets all name components to {@code null} and both styles to UNDEFINED. */
        public void clear() {
            prefix = null;
            givenNames = null;
            middleName = null;
            familyName = null;
            suffix = null;
            fullNameStyle = FullNameStyle.UNDEFINED;
            phoneticFamilyName = null;
            phoneticMiddleName = null;
            phoneticGivenName = null;
            phoneticNameStyle = PhoneticNameStyle.UNDEFINED;
        }

        /**
         * Returns true if the given, middle and family names, the suffix and all
         * phonetic components are empty. Note that {@link #prefix} is not taken
         * into account.
         */
        public boolean isEmpty() {
            return TextUtils.isEmpty(givenNames)
                    && TextUtils.isEmpty(middleName)
                    && TextUtils.isEmpty(familyName)
                    && TextUtils.isEmpty(suffix)
                    && TextUtils.isEmpty(phoneticFamilyName)
                    && TextUtils.isEmpty(phoneticMiddleName)
                    && TextUtils.isEmpty(phoneticGivenName);
        }

        @Override
        public String toString() {
            return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName
                    + " family: " + familyName + " suffix: " + suffix + " ph/given: "
                    + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: "
                    + phoneticFamilyName + "]";
        }
    }

    /**
     * Breaks a full name into at most {@link #MAX_TOKENS} words, remembering for each
     * word whether it was followed by a dot or a comma. Parsing code consumes tokens by
     * moving {@code mStartPointer} forward and {@code mEndPointer} backward.
     */
    private static class NameTokenizer {
        private final String[] mTokens;
        private int mDotBitmask;
        private int mCommaBitmask;
        private int mStartPointer;
        private int mEndPointer;

        public NameTokenizer(String fullName) {
            mTokens = new String[MAX_TOKENS];

            // Delimiters are returned as tokens so that dots and commas can be
            // attributed to the word that precedes them.
            final StringTokenizer tokenizer = new StringTokenizer(fullName, " .,", true);
            while (tokenizer.hasMoreTokens() && mEndPointer < MAX_TOKENS) {
                final String token = tokenizer.nextToken();
                if (token.length() == 0) {
                    continue;
                }

                final char first = token.charAt(0);
                if (first == ' ') {
                    continue;
                }

                if (mEndPointer > 0 && first == '.') {
                    mDotBitmask |= (1 << (mEndPointer - 1));
                } else if (mEndPointer > 0 && first == ',') {
                    mCommaBitmask |= (1 << (mEndPointer - 1));
                } else {
                    // A dot or comma that precedes the first word is kept as a token.
                    mTokens[mEndPointer] = token;
                    mEndPointer++;
                }
            }
        }

        /**
         * Returns true if the token is followed by a dot in the original full name.
         */
        public boolean hasDot(int index) {
            return (mDotBitmask & (1 << index)) != 0;
        }

        /**
         * Returns true if the token is followed by a comma in the original full name.
         */
        public boolean hasComma(int index) {
            return (mCommaBitmask & (1 << index)) != 0;
        }
    }

    /**
     * Constructor.
     *
     * @param commonPrefixes comma-separated list of common prefixes,
     *            e.g. "Mr, Ms, Mrs"
     * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
     *            e.g. "d', st, st., von"
     * @param commonSuffixes comma-separated list of common suffixes,
     *            e.g. "Jr, M.D., MD, D.D.S."
     * @param commonConjunctions comma-separated list of common conjunctions,
     *            e.g. "AND, Or"
     * @param locale locale whose language selects the default CJK name style;
     *            the default locale is used if this is {@code null}
     */
    public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
            String commonSuffixes, String commonConjunctions, Locale locale) {
        // TODO: refactor this to use <string-array> resources
        mPrefixesSet = convertToSet(commonPrefixes);
        mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
        mSuffixesSet = convertToSet(commonSuffixes);
        mConjunctions = convertToSet(commonConjunctions);
        mLocale = locale != null ? locale : Locale.getDefault();
        mLanguage = mLocale.getLanguage().toLowerCase(Locale.ROOT);

        int maxLength = 0;
        for (String suffix : mSuffixesSet) {
            if (suffix.length() > maxLength) {
                maxLength = suffix.length();
            }
        }

        mMaxSuffixLength = maxLength;
    }

    /**
     * Converts a comma-separated list of Strings to a set of Strings. Trims strings
     * and converts them to upper case. The conversion uses {@link Locale#ROOT} so
     * that lookups do not depend on the device locale (e.g. the Turkish dotted I).
     */
    private static HashSet<String> convertToSet(String strings) {
        HashSet<String> set = new HashSet<String>();
        if (strings != null) {
            String[] split = strings.split(",");
            for (int i = 0; i < split.length; i++) {
                set.add(split[i].trim().toUpperCase(Locale.ROOT));
            }
        }
        return set;
    }

    /**
     * Parses a full name and returns components as a list of tokens.
     *
     * @param tokens destination array; it must be able to hold every token produced,
     *            i.e. up to {@link #MAX_TOKENS} elements
     * @param fullName the name to tokenize, may be {@code null}
     * @return the number of tokens written to {@code tokens}
     */
    public int tokenize(String[] tokens, String fullName) {
        if (fullName == null) {
            return 0;
        }

        NameTokenizer tokenizer = new NameTokenizer(fullName);

        int count = 0;
        for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) {
            tokens[count++] = tokenizer.mTokens[i];
        }

        return count;
    }


    /**
     * Parses a full name and returns parsed components in the Name object.
     */
    public void split(Name name, String fullName) {
        if (fullName == null) {
            return;
        }

        int fullNameStyle = guessFullNameStyle(fullName);
        if (fullNameStyle == FullNameStyle.CJK) {
            fullNameStyle = getAdjustedFullNameStyle(fullNameStyle);
        }

        split(name, fullName, fullNameStyle);
    }

    /**
     * Parses a full name and returns parsed components in the Name object
     * with a given fullNameStyle.
     */
    public void split(Name name, String fullName, int fullNameStyle) {
        if (fullName == null) {
            return;
        }

        name.fullNameStyle = fullNameStyle;

        switch (fullNameStyle) {
            case FullNameStyle.CHINESE:
                splitChineseName(name, fullName);
                break;

            case FullNameStyle.JAPANESE:
                splitJapaneseName(name, fullName);
                break;

            case FullNameStyle.KOREAN:
                splitKoreanName(name, fullName);
                break;

            default:
                splitWesternName(name, fullName);
        }
    }

    /**
     * Splits a full name composed according to the Western tradition:
     * <pre>
     *   [prefix] given name(s) [[middle name] family name] [, suffix]
     *   [prefix] family name, given name [middle name] [,suffix]
     * </pre>
     */
    private void splitWesternName(Name name, String fullName) {
        NameTokenizer tokens = new NameTokenizer(fullName);
        parsePrefix(name, tokens);

        // If the name consists of just one or two tokens, treat them as first/last name,
        // not as suffix. Example: John Ma; Ma is last name, not "M.A.".
        if (tokens.mEndPointer > 2) {
            parseSuffix(name, tokens);
        }

        if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) {
            name.givenNames = tokens.mTokens[tokens.mStartPointer];
        } else {
            parseLastName(name, tokens);
            parseMiddleName(name, tokens);
            parseGivenNames(name, tokens);
        }
    }

    /**
     * Splits a full name composed according to the Chinese tradition:
     * <pre>
     *   [family name [middle name]] given name
     * </pre>
     */
    private void splitChineseName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        int tokenCount = 0;
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            tokenCount++;
            if (name.givenNames == null) {
                name.givenNames = token;
            } else if (name.familyName == null) {
                name.familyName = name.givenNames;
                name.givenNames = token;
            } else if (name.middleName == null) {
                name.middleName = name.givenNames;
                name.givenNames = token;
            } else {
                name.middleName = name.middleName + name.givenNames;
                name.givenNames = token;
            }
        }

        // If a single word parse that word up. The word itself (not the raw input,
        // which may carry surrounding whitespace) is split, and it is split by code
        // point so that supplementary ideographs are never cut in half.
        if (tokenCount == 1 && name.givenNames != null && name.familyName == null
                && name.middleName == null) {
            final String word = name.givenNames;
            final int length = word.codePointCount(0, word.length());
            if (length >= 2 && length <= 4) {
                // Four-character names are assumed to start with a two-character
                // family name; shorter ones with a one-character family name.
                final int familyEnd = word.offsetByCodePoints(0, length == 4 ? 2 : 1);
                name.familyName = word.substring(0, familyEnd);
                if (length == 2) {
                    name.givenNames = word.substring(familyEnd);
                } else {
                    final int middleEnd = word.offsetByCodePoints(familyEnd, 1);
                    name.middleName = word.substring(familyEnd, middleEnd);
                    name.givenNames = word.substring(middleEnd);
                }
            }
        }
    }

    /**
     * Splits a full name composed according to the Japanese tradition:
     * <pre>
     *   [family name] given name(s)
     * </pre>
     */
    private void splitJapaneseName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (name.givenNames == null) {
                name.givenNames = token;
            } else if (name.familyName == null) {
                name.familyName = name.givenNames;
                name.givenNames = token;
            } else {
                name.givenNames += " " + token;
            }
        }
    }

    /**
     * Splits a full name composed according to the Korean tradition:
     * <pre>
     *   [family name] given name(s)
     * </pre>
     * A name that contains no words at all leaves {@code name} untouched.
     */
    private void splitKoreanName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        final int tokenCount = tokenizer.countTokens();
        if (tokenCount == 0) {
            // Empty or whitespace-only input: there is nothing to split.
            return;
        }

        if (tokenCount > 1) {
            // Each name can be identified by separators.
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                if (name.givenNames == null) {
                    name.givenNames = token;
                } else if (name.familyName == null) {
                    name.familyName = name.givenNames;
                    name.givenNames = token;
                } else {
                    name.givenNames += " " + token;
                }
            }
        } else {
            // There is no separator. Try to guess family name from the single word
            // (surrounding whitespace has already been dropped by the tokenizer).
            final String word = tokenizer.nextToken();

            // The length of most family names is 1 (one code point).
            int familyNameLength = word.offsetByCodePoints(0, 1);

            // Compare with 2-length family names.
            for (String twoLengthFamilyName : KOREAN_TWO_CHARACTER_FAMILY_NAMES) {
                if (word.startsWith(twoLengthFamilyName)) {
                    familyNameLength = 2;
                    break;
                }
            }

            name.familyName = word.substring(0, familyNameLength);
            if (word.length() > familyNameLength) {
                name.givenNames = word.substring(familyNameLength);
            }
        }
    }

    /**
     * Concatenates components of a name according to the rules dictated by the name style.
     *
     * @param givenNameFirst is ignored for CJK display name styles
     * @param includePrefix whether {@code name.prefix} is included in the result
     */
    public String join(Name name, boolean givenNameFirst, boolean includePrefix) {
        String prefix = includePrefix ? name.prefix : null;
        switch (name.fullNameStyle) {
            case FullNameStyle.CJK:
            case FullNameStyle.CHINESE:
            case FullNameStyle.KOREAN:
                return join(prefix, name.familyName, name.middleName, name.givenNames,
                        name.suffix, false, false, false);

            case FullNameStyle.JAPANESE:
                return join(prefix, name.familyName, name.middleName, name.givenNames,
                        name.suffix, true, false, false);

            default:
                if (givenNameFirst) {
                    return join(prefix, name.givenNames, name.middleName, name.familyName,
                            name.suffix, true, false, true);
                } else {
                    return join(prefix, name.familyName, name.givenNames, name.middleName,
                            name.suffix, true, true, true);
                }
        }
    }

    /**
     * Concatenates components of the phonetic name following the CJK tradition:
     * family name + middle name + given name(s).
     */
    public String joinPhoneticName(Name name) {
        return join(null, name.phoneticFamilyName,
                name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false);
    }

    /**
     * Concatenates parts of a full name inserting spaces and commas as specified.
     * Returns {@code null} if every part is empty, and the single non-empty part
     * (a lone suffix is normalized) if there is exactly one.
     */
    private String join(String prefix, String part1, String part2, String part3, String suffix,
            boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) {
        prefix = prefix == null ? null : prefix.trim();
        part1 = part1 == null ? null : part1.trim();
        part2 = part2 == null ? null : part2.trim();
        part3 = part3 == null ? null : part3.trim();
        suffix = suffix == null ? null : suffix.trim();

        boolean hasPrefix = !TextUtils.isEmpty(prefix);
        boolean hasPart1 = !TextUtils.isEmpty(part1);
        boolean hasPart2 = !TextUtils.isEmpty(part2);
        boolean hasPart3 = !TextUtils.isEmpty(part3);
        boolean hasSuffix = !TextUtils.isEmpty(suffix);

        // Fast path: with at most one non-empty part no StringBuilder is needed.
        boolean isSingleWord = true;
        String singleWord = null;

        if (hasPrefix) {
            singleWord = prefix;
        }

        if (hasPart1) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = part1;
            }
        }

        if (hasPart2) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = part2;
            }
        }

        if (hasPart3) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = part3;
            }
        }

        if (hasSuffix) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = normalizedSuffix(suffix);
            }
        }

        if (isSingleWord) {
            return singleWord;
        }

        StringBuilder sb = new StringBuilder();

        if (hasPrefix) {
            sb.append(prefix);
        }

        if (hasPart1) {
            if (hasPrefix) {
                sb.append(' ');
            }
            sb.append(part1);
        }

        if (hasPart2) {
            if (hasPrefix || hasPart1) {
                if (useCommaAfterPart1) {
                    sb.append(',');
                }
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(part2);
        }

        if (hasPart3) {
            if (hasPrefix || hasPart1 || hasPart2) {
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(part3);
        }

        if (hasSuffix) {
            if (hasPrefix || hasPart1 || hasPart2 || hasPart3) {
                if (useCommaAfterPart3) {
                    sb.append(',');
                }
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(normalizedSuffix(suffix));
        }

        return sb.toString();
    }

    /**
     * Puts a dot after the supplied suffix if that is the accepted form of the suffix,
     * e.g. "Jr." and "Sr.", but not "I", "II" and "III".
     */
    private String normalizedSuffix(String suffix) {
        int length = suffix.length();
        if (length == 0 || suffix.charAt(length - 1) == '.') {
            return suffix;
        }

        String withDot = suffix + '.';
        if (mSuffixesSet.contains(withDot.toUpperCase(Locale.ROOT))) {
            return withDot;
        } else {
            return suffix;
        }
    }

    /**
     * If the supplied name style is undefined, returns a default based on the language,
     * otherwise returns the supplied name style itself. The generic CJK style is
     * likewise narrowed down by language, defaulting to Chinese.
     *
     * @param nameStyle See {@link FullNameStyle}.
     */
    public int getAdjustedFullNameStyle(int nameStyle) {
        if (nameStyle == FullNameStyle.UNDEFINED) {
            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.JAPANESE;
            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.KOREAN;
            } else if (CHINESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.CHINESE;
            } else {
                return FullNameStyle.WESTERN;
            }
        } else if (nameStyle == FullNameStyle.CJK) {
            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.JAPANESE;
            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.KOREAN;
            } else {
                return FullNameStyle.CHINESE;
            }
        }
        return nameStyle;
    }

    /**
     * Parses the first word from the name if it is a prefix.
     */
    private void parsePrefix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        String firstToken = tokens.mTokens[tokens.mStartPointer];
        if (mPrefixesSet.contains(firstToken.toUpperCase(Locale.ROOT))) {
            if (tokens.hasDot(tokens.mStartPointer)) {
                firstToken += '.';
            }
            name.prefix = firstToken;
            tokens.mStartPointer++;
        }
    }

    /**
     * Parses the last word(s) from the name if it is a suffix.
     */
    private void parseSuffix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        String lastToken = tokens.mTokens[tokens.mEndPointer - 1];

        // Take care of an explicit comma-separated suffix
        if (tokens.mEndPointer - tokens.mStartPointer > 2
                && tokens.hasComma(tokens.mEndPointer - 2)) {
            if (tokens.hasDot(tokens.mEndPointer - 1)) {
                lastToken += '.';
            }
            name.suffix = lastToken;
            tokens.mEndPointer--;
            return;
        }

        if (lastToken.length() > mMaxSuffixLength) {
            return;
        }

        String normalized = lastToken.toUpperCase(Locale.ROOT);
        if (mSuffixesSet.contains(normalized)) {
            name.suffix = lastToken;
            tokens.mEndPointer--;
            return;
        }

        if (tokens.hasDot(tokens.mEndPointer - 1)) {
            lastToken += '.';
        }
        normalized += ".";

        // Take care of suffixes like M.D. and D.D.S.
        int pos = tokens.mEndPointer - 1;
        while (normalized.length() <= mMaxSuffixLength) {

            if (mSuffixesSet.contains(normalized)) {
                name.suffix = lastToken;
                tokens.mEndPointer = pos;
                return;
            }

            if (pos == tokens.mStartPointer) {
                break;
            }

            pos--;
            if (tokens.hasDot(pos)) {
                lastToken = tokens.mTokens[pos] + "." + lastToken;
            } else {
                lastToken = tokens.mTokens[pos] + " " + lastToken;
            }

            normalized = tokens.mTokens[pos].toUpperCase(Locale.ROOT) + "." + normalized;
        }
    }

    /**
     * Extracts the family name from the remaining tokens: either the leading word(s)
     * followed by a comma, or the last word together with an optional last name prefix.
     */
    private void parseLastName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        // If the first word is followed by a comma, assume that it's the family name
        if (tokens.hasComma(tokens.mStartPointer)) {
            name.familyName = tokens.mTokens[tokens.mStartPointer];
            tokens.mStartPointer++;
            return;
        }

        // If the second word is followed by a comma and the first word
        // is a last name prefix as in "de Sade" and "von Cliburn", treat
        // the first two words as the family name.
        if (tokens.mStartPointer + 1 < tokens.mEndPointer
                && tokens.hasComma(tokens.mStartPointer + 1)
                && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) {
            String familyNamePrefix = tokens.mTokens[tokens.mStartPointer];
            if (tokens.hasDot(tokens.mStartPointer)) {
                familyNamePrefix += '.';
            }
            name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1];
            tokens.mStartPointer += 2;
            return;
        }

        // Finally, assume that the last word is the last name
        name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
        tokens.mEndPointer--;

        // Take care of last names like "de Sade" and "von Cliburn"
        if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
            String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1];
            if (isFamilyNamePrefix(lastNamePrefix)) {
                if (tokens.hasDot(tokens.mEndPointer - 1)) {
                    lastNamePrefix += '.';
                }
                name.familyName = lastNamePrefix + " " + name.familyName;
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de"
     */
    private boolean isFamilyNamePrefix(String word) {
        final String normalized = word.toUpperCase(Locale.ROOT);

        return mLastNamePrefixesSet.contains(normalized)
                || mLastNamePrefixesSet.contains(normalized + ".");
    }


    /**
     * Takes the last remaining token as the middle name, unless it is joined to the
     * preceding words by a conjunction (in which case it stays with the given names).
     */
    private void parseMiddleName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        if ((tokens.mEndPointer - tokens.mStartPointer) > 1) {
            if ((tokens.mEndPointer - tokens.mStartPointer) == 2
                    || !mConjunctions.contains(tokens.mTokens[tokens.mEndPointer - 2].
                            toUpperCase(Locale.ROOT))) {
                name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
                if (tokens.hasDot(tokens.mEndPointer - 1)) {
                    name.middleName += '.';
                }
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Assigns all remaining tokens, separated by spaces, as the given names.
     */
    private void parseGivenNames(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
            name.givenNames = tokens.mTokens[tokens.mStartPointer];
        } else {
            StringBuilder sb = new StringBuilder();
            for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
                if (i != tokens.mStartPointer) {
                    sb.append(' ');
                }
                sb.append(tokens.mTokens[i]);
                if (tokens.hasDot(i)) {
                    sb.append('.');
                }
            }
            name.givenNames = sb.toString();
        }
    }

    /**
     * Makes the best guess at the expected full name style based on the character set
     * used in the supplied name.  If the phonetic name is also supplied, tries to
     * differentiate between Chinese, Japanese and Korean based on the alphabet used
     * for the phonetic name.
     */
    public void guessNameStyle(Name name) {
        guessFullNameStyle(name);
        guessPhoneticNameStyle(name);
        name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle,
                name.phoneticNameStyle);
    }

    /**
     * Updates the display name style according to the phonetic name style if we
     * were unsure about display name style based on the name components, but
     * phonetic name makes it more definitive.
     */
    public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) {
        if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
            if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) {
                if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) {
                    return FullNameStyle.JAPANESE;
                } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) {
                    return FullNameStyle.KOREAN;
                }
                if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) {
                    return FullNameStyle.CHINESE;
                }
            }
        }
        return nameStyle;
    }

    /**
     * Makes the best guess at the expected full name style based on the character set
     * used in the supplied name. Does nothing if the style is already defined.
     */
    private void guessFullNameStyle(NameSplitter.Name name) {
        if (name.fullNameStyle != FullNameStyle.UNDEFINED) {
            return;
        }

        // A mix of Hanzi and latin chars are common in China, so we have to go through all
        // names if the name is not JAPANESE or KOREAN: a specific style wins immediately,
        // while CJK/WESTERN only updates the best guess so far.
        final String[] components = {
            name.givenNames, name.familyName, name.middleName, name.prefix, name.suffix
        };

        int bestGuess = FullNameStyle.UNDEFINED;
        for (String component : components) {
            final int guess = guessFullNameStyle(component);
            if (guess == FullNameStyle.UNDEFINED) {
                continue;
            }
            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
                name.fullNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }

        name.fullNameStyle = bestGuess;
    }

    /**
     * Guesses the full name style of a single string from the Unicode blocks of its
     * letters.
     *
     * @return a {@link FullNameStyle} constant; UNDEFINED if {@code name} is
     *         {@code null} or contains no letters
     */
    public int guessFullNameStyle(String name) {
        if (name == null) {
            return FullNameStyle.UNDEFINED;
        }

        int nameStyle = FullNameStyle.UNDEFINED;
        int length = name.length();
        int offset = 0;
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);

                if (!isLatinUnicodeBlock(unicodeBlock)) {

                    if (isCJKUnicodeBlock(unicodeBlock)) {
                        // We don't know if this is Chinese, Japanese or Korean -
                        // trying to figure out by looking at other characters in the name
                        return guessCJKNameStyle(name, offset + Character.charCount(codePoint));
                    }

                    if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                        return FullNameStyle.JAPANESE;
                    }

                    if (isKoreanUnicodeBlock(unicodeBlock)) {
                        return FullNameStyle.KOREAN;
                    }
                }
                nameStyle = FullNameStyle.WESTERN;
            }
            offset += Character.charCount(codePoint);
        }
        return nameStyle;
    }

    /**
     * Scans the rest of a name that started with a CJK ideograph, looking for a
     * Japanese or Korean letter that disambiguates the style.
     */
    private int guessCJKNameStyle(String name, int offset) {
        int length = name.length();
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                    return FullNameStyle.JAPANESE;
                }
                if (isKoreanUnicodeBlock(unicodeBlock)) {
                    return FullNameStyle.KOREAN;
                }
            }
            offset += Character.charCount(codePoint);
        }

        return FullNameStyle.CJK;
    }

    /**
     * Sets the phonetic name style from the first phonetic component (family, given,
     * middle - in that order) whose style can be determined. Does nothing if the
     * style is already defined.
     */
    private void guessPhoneticNameStyle(NameSplitter.Name name) {
        if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
            return;
        }

        final String[] components = {
            name.phoneticFamilyName, name.phoneticGivenName, name.phoneticMiddleName
        };

        for (String component : components) {
            final int guess = guessPhoneticNameStyle(component);
            if (guess != PhoneticNameStyle.UNDEFINED) {
                name.phoneticNameStyle = guess;
                return;
            }
        }
    }

    /**
     * Guesses the phonetic name style of a single string from the Unicode block of
     * its first recognizable letter.
     *
     * @return a {@link PhoneticNameStyle} constant; UNDEFINED if {@code name} is
     *         {@code null} or contains no Japanese, Korean or Latin letters
     */
    public int guessPhoneticNameStyle(String name) {
        if (name == null) {
            return PhoneticNameStyle.UNDEFINED;
        }

        int nameStyle = PhoneticNameStyle.UNDEFINED;
        int length = name.length();
        int offset = 0;
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.JAPANESE;
                }
                if (isKoreanUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.KOREAN;
                }
                if (isLatinUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.PINYIN;
                }
            }
            offset += Character.charCount(codePoint);
        }

        return nameStyle;
    }

    private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.BASIC_LATIN ||
                unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
    }

    private static boolean isCJKUnicodeBlock(UnicodeBlock block) {
        return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT
                || block == UnicodeBlock.CJK_COMPATIBILITY
                || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS
                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
    }

    private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES ||
                unicodeBlock == UnicodeBlock.HANGUL_JAMO ||
                unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
    }

    private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.KATAKANA ||
                unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS ||
                unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS ||
                unicodeBlock == UnicodeBlock.HIRAGANA;
    }
}