1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2013, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.impl.locale; 8 9 import java.util.ArrayList; 10 import java.util.Collections; 11 import java.util.HashMap; 12 import java.util.List; 13 import java.util.Map; 14 import java.util.Set; 15 16 public class LanguageTag { 17 private static final boolean JDKIMPL = false; 18 19 // 20 // static fields 21 // 22 public static final String SEP = "-"; 23 public static final String PRIVATEUSE = "x"; 24 public static String UNDETERMINED = "und"; 25 public static final String PRIVUSE_VARIANT_PREFIX = "lvariant"; 26 27 // 28 // Language subtag fields 29 // 30 private String _language = ""; // language subtag 31 private String _script = ""; // script subtag 32 private String _region = ""; // region subtag 33 private String _privateuse = ""; // privateuse 34 35 private List<String> _extlangs = Collections.emptyList(); // extlang subtags 36 private List<String> _variants = Collections.emptyList(); // variant subtags 37 private List<String> _extensions = Collections.emptyList(); // extensions 38 39 // Map contains grandfathered tags and its preferred mappings from 40 // http://www.ietf.org/rfc/rfc5646.txt 41 private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED = 42 new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>(); 43 44 static { 45 // grandfathered = irregular ; non-redundant tags registered 46 // / regular ; during the RFC 3066 era 47 // 48 // irregular = "en-GB-oed" ; irregular tags do not match 49 // / "i-ami" ; the 'langtag' production and 50 // / "i-bnn" ; would not otherwise be 51 // / "i-default" ; considered 'well-formed' 52 // / "i-enochian" ; These tags are all valid, 53 // / "i-hak" ; but most are deprecated 54 // / "i-klingon" ; in favor of more modern 55 // / "i-lux" ; subtags or subtag 56 // / "i-mingo" ; combination 57 // / "i-navajo" 58 // / "i-pwn" 59 // / "i-tao" 60 // / "i-tay" 61 // / "i-tsu" 62 // / "sgn-BE-FR" 63 // / "sgn-BE-NL" 64 // / "sgn-CH-DE" 65 // 66 // regular = "art-lojban" ; these tags match the 'langtag' 67 // / "cel-gaulish" ; production, but their subtags 68 // / "no-bok" ; are not extended language 69 // / "no-nyn" ; or variant subtags: their meaning 70 // / "zh-guoyu" ; is defined by their registration 71 // / "zh-hakka" ; and all of these are deprecated 72 // / "zh-min" ; in favor of a more modern 73 // / "zh-min-nan" ; subtag or sequence of subtags 74 // / "zh-xiang" 75 76 final String[][] entries = { 77 //{"tag", "preferred"}, 78 {"art-lojban", "jbo"}, 79 {"cel-gaulish", "xtg-x-cel-gaulish"}, // fallback 80 {"en-GB-oed", "en-GB-x-oed"}, // fallback 81 {"i-ami", "ami"}, 82 {"i-bnn", "bnn"}, 83 {"i-default", "en-x-i-default"}, // fallback 84 {"i-enochian", "und-x-i-enochian"}, // fallback 85 {"i-hak", "hak"}, 86 {"i-klingon", "tlh"}, 87 {"i-lux", "lb"}, 88 {"i-mingo", "see-x-i-mingo"}, // fallback 89 {"i-navajo", "nv"}, 90 {"i-pwn", "pwn"}, 91 {"i-tao", "tao"}, 92 {"i-tay", "tay"}, 93 {"i-tsu", "tsu"}, 94 {"no-bok", "nb"}, 95 {"no-nyn", "nn"}, 96 {"sgn-BE-FR", "sfb"}, 97 {"sgn-BE-NL", "vgt"}, 98 {"sgn-CH-DE", "sgg"}, 99 {"zh-guoyu", "cmn"}, 100 {"zh-hakka", "hak"}, 101 {"zh-min", "nan-x-zh-min"}, // fallback 102 {"zh-min-nan", "nan"}, 103 {"zh-xiang", "hsn"}, 104 }; 105 for (String[] e : entries) { GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e)106 GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e); 107 } 108 } 109 LanguageTag()110 private LanguageTag() { 111 } 112 113 /* 114 * BNF in RFC5464 115 * 116 * Language-Tag = langtag ; normal language tags 117 * / privateuse ; private use tag 118 * / grandfathered ; grandfathered tags 119 * 120 * 121 * langtag = language 122 * ["-" script] 123 * ["-" region] 124 * *("-" variant) 125 * *("-" extension) 126 * ["-" privateuse] 127 * 128 * language = 2*3ALPHA ; shortest ISO 639 code 129 * ["-" extlang] ; sometimes followed by 130 * ; extended language subtags 131 * / 4ALPHA ; or reserved for future use 132 * / 5*8ALPHA ; or registered language subtag 133 * 134 * extlang = 3ALPHA ; selected ISO 639 codes 135 * *2("-" 3ALPHA) ; permanently reserved 136 * 137 * script = 4ALPHA ; ISO 15924 code 138 * 139 * region = 2ALPHA ; ISO 3166-1 code 140 * / 3DIGIT ; UN M.49 code 141 * 142 * variant = 5*8alphanum ; registered variants 143 * / (DIGIT 3alphanum) 144 * 145 * extension = singleton 1*("-" (2*8alphanum)) 146 * 147 * ; Single alphanumerics 148 * ; "x" reserved for private use 149 * singleton = DIGIT ; 0 - 9 150 * / %x41-57 ; A - W 151 * / %x59-5A ; Y - Z 152 * / %x61-77 ; a - w 153 * / %x79-7A ; y - z 154 * 155 * privateuse = "x" 1*("-" (1*8alphanum)) 156 * 157 */ parse(String languageTag, ParseStatus sts)158 public static LanguageTag parse(String languageTag, ParseStatus sts) { 159 if (sts == null) { 160 sts = new ParseStatus(); 161 } else { 162 sts.reset(); 163 } 164 165 StringTokenIterator itr; 166 boolean isGrandfathered = false; 167 168 // Check if the tag is grandfathered 169 String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag)); 170 if (gfmap != null) { 171 // use preferred mapping 172 itr = new StringTokenIterator(gfmap[1], SEP); 173 isGrandfathered = true; 174 } else { 175 itr = new StringTokenIterator(languageTag, SEP); 176 } 177 178 LanguageTag tag = new LanguageTag(); 179 180 // langtag must start with either language or privateuse 181 if (tag.parseLanguage(itr, sts)) { 182 tag.parseExtlangs(itr, sts); 183 tag.parseScript(itr, sts); 184 tag.parseRegion(itr, sts); 185 tag.parseVariants(itr, sts); 186 tag.parseExtensions(itr, sts); 187 } 188 tag.parsePrivateuse(itr, sts); 189 190 if (isGrandfathered) { 191 // Grandfathered tag is replaced with a well-formed tag above. 192 // However, the parsed length must be the original tag length. 193 assert (itr.isDone()); 194 assert (!sts.isError()); 195 sts._parseLength = languageTag.length(); 196 } else if (!itr.isDone() && !sts.isError()) { 197 String s = itr.current(); 198 sts._errorIndex = itr.currentStart(); 199 if (s.length() == 0) { 200 sts._errorMsg = "Empty subtag"; 201 } else { 202 sts._errorMsg = "Invalid subtag: " + s; 203 } 204 } 205 206 return tag; 207 } 208 209 // 210 // Language subtag parsers 211 // 212 parseLanguage(StringTokenIterator itr, ParseStatus sts)213 private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) { 214 if (itr.isDone() || sts.isError()) { 215 return false; 216 } 217 218 boolean found = false; 219 220 String s = itr.current(); 221 if (isLanguage(s)) { 222 found = true; 223 _language = s; 224 sts._parseLength = itr.currentEnd(); 225 itr.next(); 226 } 227 228 return found; 229 } 230 parseExtlangs(StringTokenIterator itr, ParseStatus sts)231 private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) { 232 if (itr.isDone() || sts.isError()) { 233 return false; 234 } 235 236 boolean found = false; 237 238 while (!itr.isDone()) { 239 String s = itr.current(); 240 if (!isExtlang(s)) { 241 break; 242 } 243 found = true; 244 if (_extlangs.isEmpty()) { 245 _extlangs = new ArrayList<String>(3); 246 } 247 _extlangs.add(s); 248 sts._parseLength = itr.currentEnd(); 249 itr.next(); 250 251 if (_extlangs.size() == 3) { 252 // Maximum 3 extlangs 253 break; 254 } 255 } 256 257 return found; 258 } 259 parseScript(StringTokenIterator itr, ParseStatus sts)260 private boolean parseScript(StringTokenIterator itr, ParseStatus sts) { 261 if (itr.isDone() || sts.isError()) { 262 return false; 263 } 264 265 boolean found = false; 266 267 String s = itr.current(); 268 if (isScript(s)) { 269 found = true; 270 _script = s; 271 sts._parseLength = itr.currentEnd(); 272 itr.next(); 273 } 274 275 return found; 276 } 277 parseRegion(StringTokenIterator itr, ParseStatus sts)278 private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) { 279 if (itr.isDone() || sts.isError()) { 280 return false; 281 } 282 283 boolean found = false; 284 285 String s = itr.current(); 286 if (isRegion(s)) { 287 found = true; 288 _region = s; 289 sts._parseLength = itr.currentEnd(); 290 itr.next(); 291 } 292 293 return found; 294 } 295 parseVariants(StringTokenIterator itr, ParseStatus sts)296 private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) { 297 if (itr.isDone() || sts.isError()) { 298 return false; 299 } 300 301 boolean found = false; 302 303 while (!itr.isDone()) { 304 String s = itr.current(); 305 if (!isVariant(s)) { 306 break; 307 } 308 found = true; 309 if (_variants.isEmpty()) { 310 _variants = new ArrayList<String>(3); 311 } 312 _variants.add(s); 313 sts._parseLength = itr.currentEnd(); 314 itr.next(); 315 } 316 317 return found; 318 } 319 parseExtensions(StringTokenIterator itr, ParseStatus sts)320 private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) { 321 if (itr.isDone() || sts.isError()) { 322 return false; 323 } 324 325 boolean found = false; 326 327 while (!itr.isDone()) { 328 String s = itr.current(); 329 if (isExtensionSingleton(s)) { 330 int start = itr.currentStart(); 331 String singleton = s; 332 StringBuilder sb = new StringBuilder(singleton); 333 334 itr.next(); 335 while (!itr.isDone()) { 336 s = itr.current(); 337 if (isExtensionSubtag(s)) { 338 sb.append(SEP).append(s); 339 sts._parseLength = itr.currentEnd(); 340 } else { 341 break; 342 } 343 itr.next(); 344 } 345 346 if (sts._parseLength <= start) { 347 sts._errorIndex = start; 348 sts._errorMsg = "Incomplete extension '" + singleton + "'"; 349 break; 350 } 351 352 if (_extensions.size() == 0) { 353 _extensions = new ArrayList<String>(4); 354 } 355 _extensions.add(sb.toString()); 356 found = true; 357 } else { 358 break; 359 } 360 } 361 return found; 362 } 363 parsePrivateuse(StringTokenIterator itr, ParseStatus sts)364 private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) { 365 if (itr.isDone() || sts.isError()) { 366 return false; 367 } 368 369 boolean found = false; 370 371 String s = itr.current(); 372 if (isPrivateusePrefix(s)) { 373 int start = itr.currentStart(); 374 StringBuilder sb = new StringBuilder(s); 375 376 itr.next(); 377 while (!itr.isDone()) { 378 s = itr.current(); 379 if (!isPrivateuseSubtag(s)) { 380 break; 381 } 382 sb.append(SEP).append(s); 383 sts._parseLength = itr.currentEnd(); 384 385 itr.next(); 386 } 387 388 if (sts._parseLength <= start) { 389 // need at least 1 private subtag 390 sts._errorIndex = start; 391 sts._errorMsg = "Incomplete privateuse"; 392 } else { 393 _privateuse = sb.toString(); 394 found = true; 395 } 396 } 397 398 return found; 399 } 400 parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions)401 public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) { 402 LanguageTag tag = new LanguageTag(); 403 404 String language = baseLocale.getLanguage(); 405 String script = baseLocale.getScript(); 406 String region = baseLocale.getRegion(); 407 String variant = baseLocale.getVariant(); 408 409 boolean hasSubtag = false; 410 411 String privuseVar = null; // store ill-formed variant subtags 412 413 if (language.length() > 0 && isLanguage(language)) { 414 // Convert a deprecated language code used by Java to 415 // a new code 416 if (language.equals("iw")) { 417 language = "he"; 418 } else if (language.equals("ji")) { 419 language = "yi"; 420 } else if (language.equals("in")) { 421 language = "id"; 422 } 423 tag._language = language; 424 } 425 426 if (script.length() > 0 && isScript(script)) { 427 tag._script = canonicalizeScript(script); 428 hasSubtag = true; 429 } 430 431 if (region.length() > 0 && isRegion(region)) { 432 tag._region = canonicalizeRegion(region); 433 hasSubtag = true; 434 } 435 436 if (JDKIMPL) { 437 // Special handling for no_NO_NY - use nn_NO for language tag 438 if (tag._language.equals("no") && tag._region.equals("NO") && variant.equals("NY")) { 439 tag._language = "nn"; 440 variant = ""; 441 } 442 } 443 444 if (variant.length() > 0) { 445 List<String> variants = null; 446 StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP); 447 while (!varitr.isDone()) { 448 String var = varitr.current(); 449 if (!isVariant(var)) { 450 break; 451 } 452 if (variants == null) { 453 variants = new ArrayList<String>(); 454 } 455 if (JDKIMPL) { 456 variants.add(var); // Do not canonicalize! 457 } else { 458 variants.add(canonicalizeVariant(var)); 459 } 460 varitr.next(); 461 } 462 if (variants != null) { 463 tag._variants = variants; 464 hasSubtag = true; 465 } 466 if (!varitr.isDone()) { 467 // ill-formed variant subtags 468 StringBuilder buf = new StringBuilder(); 469 while (!varitr.isDone()) { 470 String prvv = varitr.current(); 471 if (!isPrivateuseSubtag(prvv)) { 472 // cannot use private use subtag - truncated 473 break; 474 } 475 if (buf.length() > 0) { 476 buf.append(SEP); 477 } 478 if (!JDKIMPL) { 479 prvv = AsciiUtil.toLowerString(prvv); 480 } 481 buf.append(prvv); 482 varitr.next(); 483 } 484 if (buf.length() > 0) { 485 privuseVar = buf.toString(); 486 } 487 } 488 } 489 490 List<String> extensions = null; 491 String privateuse = null; 492 493 Set<Character> locextKeys = localeExtensions.getKeys(); 494 for (Character locextKey : locextKeys) { 495 Extension ext = localeExtensions.getExtension(locextKey); 496 if (isPrivateusePrefixChar(locextKey.charValue())) { 497 privateuse = ext.getValue(); 498 } else { 499 if (extensions == null) { 500 extensions = new ArrayList<String>(); 501 } 502 extensions.add(locextKey.toString() + SEP + ext.getValue()); 503 } 504 } 505 506 if (extensions != null) { 507 tag._extensions = extensions; 508 hasSubtag = true; 509 } 510 511 // append ill-formed variant subtags to private use 512 if (privuseVar != null) { 513 if (privateuse == null) { 514 privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar; 515 } else { 516 privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX + SEP + privuseVar.replace(BaseLocale.SEP, SEP); 517 } 518 } 519 520 if (privateuse != null) { 521 tag._privateuse = privateuse; 522 } 523 524 if (tag._language.length() == 0 && (hasSubtag || privateuse == null)) { 525 // use lang "und" when 1) no language is available AND 526 // 2) any of other subtags other than private use are available or 527 // no private use tag is available 528 tag._language = UNDETERMINED; 529 } 530 531 return tag; 532 } 533 534 // 535 // Getter methods for language subtag fields 536 // 537 getLanguage()538 public String getLanguage() { 539 return _language; 540 } 541 getExtlangs()542 public List<String> getExtlangs() { 543 return Collections.unmodifiableList(_extlangs); 544 } 545 getScript()546 public String getScript() { 547 return _script; 548 } 549 getRegion()550 public String getRegion() { 551 return _region; 552 } 553 getVariants()554 public List<String> getVariants() { 555 return Collections.unmodifiableList(_variants); 556 } 557 getExtensions()558 public List<String> getExtensions() { 559 return Collections.unmodifiableList(_extensions); 560 } 561 getPrivateuse()562 public String getPrivateuse() { 563 return _privateuse; 564 } 565 566 // 567 // Language subtag syntax checking methods 568 // 569 isLanguage(String s)570 public static boolean isLanguage(String s) { 571 // language = 2*3ALPHA ; shortest ISO 639 code 572 // ["-" extlang] ; sometimes followed by 573 // ; extended language subtags 574 // / 4ALPHA ; or reserved for future use 575 // / 5*8ALPHA ; or registered language subtag 576 return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaString(s); 577 } 578 isExtlang(String s)579 public static boolean isExtlang(String s) { 580 // extlang = 3ALPHA ; selected ISO 639 codes 581 // *2("-" 3ALPHA) ; permanently reserved 582 return (s.length() == 3) && AsciiUtil.isAlphaString(s); 583 } 584 isScript(String s)585 public static boolean isScript(String s) { 586 // script = 4ALPHA ; ISO 15924 code 587 return (s.length() == 4) && AsciiUtil.isAlphaString(s); 588 } 589 isRegion(String s)590 public static boolean isRegion(String s) { 591 // region = 2ALPHA ; ISO 3166-1 code 592 // / 3DIGIT ; UN M.49 code 593 return ((s.length() == 2) && AsciiUtil.isAlphaString(s)) 594 || ((s.length() == 3) && AsciiUtil.isNumericString(s)); 595 } 596 isVariant(String s)597 public static boolean isVariant(String s) { 598 // variant = 5*8alphanum ; registered variants 599 // / (DIGIT 3alphanum) 600 int len = s.length(); 601 if (len >= 5 && len <= 8) { 602 return AsciiUtil.isAlphaNumericString(s); 603 } 604 if (len == 4) { 605 return AsciiUtil.isNumeric(s.charAt(0)) 606 && AsciiUtil.isAlphaNumeric(s.charAt(1)) 607 && AsciiUtil.isAlphaNumeric(s.charAt(2)) 608 && AsciiUtil.isAlphaNumeric(s.charAt(3)); 609 } 610 return false; 611 } 612 isExtensionSingleton(String s)613 public static boolean isExtensionSingleton(String s) { 614 // singleton = DIGIT ; 0 - 9 615 // / %x41-57 ; A - W 616 // / %x59-5A ; Y - Z 617 // / %x61-77 ; a - w 618 // / %x79-7A ; y - z 619 620 return (s.length() == 1) 621 && AsciiUtil.isAlphaString(s) 622 && !AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s); 623 } 624 isExtensionSingletonChar(char c)625 public static boolean isExtensionSingletonChar(char c) { 626 return isExtensionSingleton(String.valueOf(c)); 627 } 628 isExtensionSubtag(String s)629 public static boolean isExtensionSubtag(String s) { 630 // extension = singleton 1*("-" (2*8alphanum)) 631 return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s); 632 } 633 isPrivateusePrefix(String s)634 public static boolean isPrivateusePrefix(String s) { 635 // privateuse = "x" 1*("-" (1*8alphanum)) 636 return (s.length() == 1) 637 && AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s); 638 } 639 isPrivateusePrefixChar(char c)640 public static boolean isPrivateusePrefixChar(char c) { 641 return (AsciiUtil.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c))); 642 } 643 isPrivateuseSubtag(String s)644 public static boolean isPrivateuseSubtag(String s) { 645 // privateuse = "x" 1*("-" (1*8alphanum)) 646 return (s.length() >= 1) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s); 647 } 648 649 // 650 // Language subtag canonicalization methods 651 // 652 canonicalizeLanguage(String s)653 public static String canonicalizeLanguage(String s) { 654 return AsciiUtil.toLowerString(s); 655 } 656 canonicalizeExtlang(String s)657 public static String canonicalizeExtlang(String s) { 658 return AsciiUtil.toLowerString(s); 659 } 660 canonicalizeScript(String s)661 public static String canonicalizeScript(String s) { 662 return AsciiUtil.toTitleString(s); 663 } 664 canonicalizeRegion(String s)665 public static String canonicalizeRegion(String s) { 666 return AsciiUtil.toUpperString(s); 667 } 668 canonicalizeVariant(String s)669 public static String canonicalizeVariant(String s) { 670 return AsciiUtil.toLowerString(s); 671 } 672 canonicalizeExtension(String s)673 public static String canonicalizeExtension(String s) { 674 return AsciiUtil.toLowerString(s); 675 } 676 canonicalizeExtensionSingleton(String s)677 public static String canonicalizeExtensionSingleton(String s) { 678 return AsciiUtil.toLowerString(s); 679 } 680 canonicalizeExtensionSubtag(String s)681 public static String canonicalizeExtensionSubtag(String s) { 682 return AsciiUtil.toLowerString(s); 683 } 684 canonicalizePrivateuse(String s)685 public static String canonicalizePrivateuse(String s) { 686 return AsciiUtil.toLowerString(s); 687 } 688 canonicalizePrivateuseSubtag(String s)689 public static String canonicalizePrivateuseSubtag(String s) { 690 return AsciiUtil.toLowerString(s); 691 } 692 toString()693 public String toString() { 694 StringBuilder sb = new StringBuilder(); 695 696 if (_language.length() > 0) { 697 sb.append(_language); 698 699 for (String extlang : _extlangs) { 700 sb.append(SEP).append(extlang); 701 } 702 703 if (_script.length() > 0) { 704 sb.append(SEP).append(_script); 705 } 706 707 if (_region.length() > 0) { 708 sb.append(SEP).append(_region); 709 } 710 711 for (String variant : _variants) { 712 sb.append(SEP).append(variant); 713 } 714 715 for (String extension : _extensions) { 716 sb.append(SEP).append(extension); 717 } 718 } 719 if (_privateuse.length() > 0) { 720 if (sb.length() > 0) { 721 sb.append(SEP); 722 } 723 sb.append(_privateuse); 724 } 725 726 return sb.toString(); 727 } 728 } 729