1 /* 2 * Copyright (c) 2010, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ******************************************************************************* 28 * Copyright (C) 2010, International Business Machines Corporation and * 29 * others. All Rights Reserved. * 30 ******************************************************************************* 31 */ 32 package sun.util.locale; 33 34 import java.util.ArrayList; 35 import java.util.Collections; 36 import java.util.HashMap; 37 import java.util.List; 38 import java.util.Map; 39 import java.util.Set; 40 import java.util.StringJoiner; 41 42 public class LanguageTag { 43 // 44 // static fields 45 // 46 public static final String SEP = "-"; 47 public static final String PRIVATEUSE = "x"; 48 public static final String UNDETERMINED = "und"; 49 public static final String PRIVUSE_VARIANT_PREFIX = "lvariant"; 50 51 // 52 // Language subtag fields 53 // 54 private String language = ""; // language subtag 55 private String script = ""; // script subtag 56 private String region = ""; // region subtag 57 private String privateuse = ""; // privateuse 58 59 private List<String> extlangs = Collections.emptyList(); // extlang subtags 60 private List<String> variants = Collections.emptyList(); // variant subtags 61 private List<String> extensions = Collections.emptyList(); // extensions 62 63 // Map contains legacy language tags and its preferred mappings from 64 // http://www.ietf.org/rfc/rfc5646.txt 65 // Keys are lower-case strings. 66 private static final Map<String, String[]> LEGACY = new HashMap<>(); 67 68 static { 69 // grandfathered = irregular ; non-redundant tags registered 70 // / regular ; during the RFC 3066 era 71 // 72 // irregular = "en-GB-oed" ; irregular tags do not match 73 // / "i-ami" ; the 'langtag' production and 74 // / "i-bnn" ; would not otherwise be 75 // / "i-default" ; considered 'well-formed' 76 // / "i-enochian" ; These tags are all valid, 77 // / "i-hak" ; but most are deprecated 78 // / "i-klingon" ; in favor of more modern 79 // / "i-lux" ; subtags or subtag 80 // / "i-mingo" ; combination 81 // / "i-navajo" 82 // / "i-pwn" 83 // / "i-tao" 84 // / "i-tay" 85 // / "i-tsu" 86 // / "sgn-BE-FR" 87 // / "sgn-BE-NL" 88 // / "sgn-CH-DE" 89 // 90 // regular = "art-lojban" ; these tags match the 'langtag' 91 // / "cel-gaulish" ; production, but their subtags 92 // / "no-bok" ; are not extended language 93 // / "no-nyn" ; or variant subtags: their meaning 94 // / "zh-guoyu" ; is defined by their registration 95 // / "zh-hakka" ; and all of these are deprecated 96 // / "zh-min" ; in favor of a more modern 97 // / "zh-min-nan" ; subtag or sequence of subtags 98 // / "zh-xiang" 99 100 final String[][] entries = { 101 //{"tag", "preferred"}, 102 {"art-lojban", "jbo"}, 103 {"cel-gaulish", "xtg-x-cel-gaulish"}, // fallback 104 {"en-GB-oed", "en-GB-x-oed"}, // fallback 105 {"i-ami", "ami"}, 106 {"i-bnn", "bnn"}, 107 {"i-default", "en-x-i-default"}, // fallback 108 {"i-enochian", "und-x-i-enochian"}, // fallback 109 {"i-hak", "hak"}, 110 {"i-klingon", "tlh"}, 111 {"i-lux", "lb"}, 112 {"i-mingo", "see-x-i-mingo"}, // fallback 113 {"i-navajo", "nv"}, 114 {"i-pwn", "pwn"}, 115 {"i-tao", "tao"}, 116 {"i-tay", "tay"}, 117 {"i-tsu", "tsu"}, 118 {"no-bok", "nb"}, 119 {"no-nyn", "nn"}, 120 {"sgn-BE-FR", "sfb"}, 121 {"sgn-BE-NL", "vgt"}, 122 {"sgn-CH-DE", "sgg"}, 123 {"zh-guoyu", "cmn"}, 124 {"zh-hakka", "hak"}, 125 {"zh-min", "nan-x-zh-min"}, // fallback 126 {"zh-min-nan", "nan"}, 127 {"zh-xiang", "hsn"}, 128 }; 129 for (String[] e : entries) { LocaleUtils.toLowerString(e[0])130 LEGACY.put(LocaleUtils.toLowerString(e[0]), e); 131 } 132 } 133 LanguageTag()134 private LanguageTag() { 135 } 136 137 /* 138 * BNF in RFC5646 139 * 140 * Language-Tag = langtag ; normal language tags 141 * / privateuse ; private use tag 142 * / grandfathered ; grandfathered tags 143 * 144 * 145 * langtag = language 146 * ["-" script] 147 * ["-" region] 148 * *("-" variant) 149 * *("-" extension) 150 * ["-" privateuse] 151 * 152 * language = 2*3ALPHA ; shortest ISO 639 code 153 * ["-" extlang] ; sometimes followed by 154 * ; extended language subtags 155 * / 4ALPHA ; or reserved for future use 156 * / 5*8ALPHA ; or registered language subtag 157 * 158 * extlang = 3ALPHA ; selected ISO 639 codes 159 * *2("-" 3ALPHA) ; permanently reserved 160 * 161 * script = 4ALPHA ; ISO 15924 code 162 * 163 * region = 2ALPHA ; ISO 3166-1 code 164 * / 3DIGIT ; UN M.49 code 165 * 166 * variant = 5*8alphanum ; registered variants 167 * / (DIGIT 3alphanum) 168 * 169 * extension = singleton 1*("-" (2*8alphanum)) 170 * 171 * ; Single alphanumerics 172 * ; "x" reserved for private use 173 * singleton = DIGIT ; 0 - 9 174 * / %x41-57 ; A - W 175 * / %x59-5A ; Y - Z 176 * / %x61-77 ; a - w 177 * / %x79-7A ; y - z 178 * 179 * privateuse = "x" 1*("-" (1*8alphanum)) 180 * 181 */ parse(String languageTag, ParseStatus sts)182 public static LanguageTag parse(String languageTag, ParseStatus sts) { 183 if (sts == null) { 184 sts = new ParseStatus(); 185 } else { 186 sts.reset(); 187 } 188 189 StringTokenIterator itr; 190 191 // Check if the tag is a legacy language tag 192 String[] gfmap = LEGACY.get(LocaleUtils.toLowerString(languageTag)); 193 if (gfmap != null) { 194 // use preferred mapping 195 itr = new StringTokenIterator(gfmap[1], SEP); 196 } else { 197 itr = new StringTokenIterator(languageTag, SEP); 198 } 199 200 LanguageTag tag = new LanguageTag(); 201 202 // langtag must start with either language or privateuse 203 if (tag.parseLanguage(itr, sts)) { 204 tag.parseExtlangs(itr, sts); 205 tag.parseScript(itr, sts); 206 tag.parseRegion(itr, sts); 207 tag.parseVariants(itr, sts); 208 tag.parseExtensions(itr, sts); 209 } 210 tag.parsePrivateuse(itr, sts); 211 212 if (!itr.isDone() && !sts.isError()) { 213 String s = itr.current(); 214 sts.errorIndex = itr.currentStart(); 215 if (s.isEmpty()) { 216 sts.errorMsg = "Empty subtag"; 217 } else { 218 sts.errorMsg = "Invalid subtag: " + s; 219 } 220 } 221 222 return tag; 223 } 224 225 // 226 // Language subtag parsers 227 // 228 parseLanguage(StringTokenIterator itr, ParseStatus sts)229 private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) { 230 if (itr.isDone() || sts.isError()) { 231 return false; 232 } 233 234 boolean found = false; 235 236 String s = itr.current(); 237 if (isLanguage(s)) { 238 found = true; 239 language = s; 240 sts.parseLength = itr.currentEnd(); 241 itr.next(); 242 } 243 244 return found; 245 } 246 parseExtlangs(StringTokenIterator itr, ParseStatus sts)247 private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) { 248 if (itr.isDone() || sts.isError()) { 249 return false; 250 } 251 252 boolean found = false; 253 254 while (!itr.isDone()) { 255 String s = itr.current(); 256 if (!isExtlang(s)) { 257 break; 258 } 259 found = true; 260 if (extlangs.isEmpty()) { 261 extlangs = new ArrayList<>(3); 262 } 263 extlangs.add(s); 264 sts.parseLength = itr.currentEnd(); 265 itr.next(); 266 267 if (extlangs.size() == 3) { 268 // Maximum 3 extlangs 269 break; 270 } 271 } 272 273 return found; 274 } 275 parseScript(StringTokenIterator itr, ParseStatus sts)276 private boolean parseScript(StringTokenIterator itr, ParseStatus sts) { 277 if (itr.isDone() || sts.isError()) { 278 return false; 279 } 280 281 boolean found = false; 282 283 String s = itr.current(); 284 if (isScript(s)) { 285 found = true; 286 script = s; 287 sts.parseLength = itr.currentEnd(); 288 itr.next(); 289 } 290 291 return found; 292 } 293 parseRegion(StringTokenIterator itr, ParseStatus sts)294 private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) { 295 if (itr.isDone() || sts.isError()) { 296 return false; 297 } 298 299 boolean found = false; 300 301 String s = itr.current(); 302 if (isRegion(s)) { 303 found = true; 304 region = s; 305 sts.parseLength = itr.currentEnd(); 306 itr.next(); 307 } 308 309 return found; 310 } 311 parseVariants(StringTokenIterator itr, ParseStatus sts)312 private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) { 313 if (itr.isDone() || sts.isError()) { 314 return false; 315 } 316 317 boolean found = false; 318 319 while (!itr.isDone()) { 320 String s = itr.current(); 321 if (!isVariant(s)) { 322 break; 323 } 324 found = true; 325 if (variants.isEmpty()) { 326 variants = new ArrayList<>(3); 327 } 328 variants.add(s); 329 sts.parseLength = itr.currentEnd(); 330 itr.next(); 331 } 332 333 return found; 334 } 335 parseExtensions(StringTokenIterator itr, ParseStatus sts)336 private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) { 337 if (itr.isDone() || sts.isError()) { 338 return false; 339 } 340 341 boolean found = false; 342 343 while (!itr.isDone()) { 344 String s = itr.current(); 345 if (isExtensionSingleton(s)) { 346 int start = itr.currentStart(); 347 String singleton = s; 348 StringBuilder sb = new StringBuilder(singleton); 349 350 itr.next(); 351 while (!itr.isDone()) { 352 s = itr.current(); 353 if (isExtensionSubtag(s)) { 354 sb.append(SEP).append(s); 355 sts.parseLength = itr.currentEnd(); 356 } else { 357 break; 358 } 359 itr.next(); 360 } 361 362 if (sts.parseLength <= start) { 363 sts.errorIndex = start; 364 sts.errorMsg = "Incomplete extension '" + singleton + "'"; 365 break; 366 } 367 368 if (extensions.isEmpty()) { 369 extensions = new ArrayList<>(4); 370 } 371 extensions.add(sb.toString()); 372 found = true; 373 } else { 374 break; 375 } 376 } 377 return found; 378 } 379 parsePrivateuse(StringTokenIterator itr, ParseStatus sts)380 private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) { 381 if (itr.isDone() || sts.isError()) { 382 return false; 383 } 384 385 boolean found = false; 386 387 String s = itr.current(); 388 if (isPrivateusePrefix(s)) { 389 int start = itr.currentStart(); 390 StringBuilder sb = new StringBuilder(s); 391 392 itr.next(); 393 while (!itr.isDone()) { 394 s = itr.current(); 395 if (!isPrivateuseSubtag(s)) { 396 break; 397 } 398 sb.append(SEP).append(s); 399 sts.parseLength = itr.currentEnd(); 400 401 itr.next(); 402 } 403 404 if (sts.parseLength <= start) { 405 // need at least 1 private subtag 406 sts.errorIndex = start; 407 sts.errorMsg = "Incomplete privateuse"; 408 } else { 409 privateuse = sb.toString(); 410 found = true; 411 } 412 } 413 414 return found; 415 } 416 parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions)417 public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) { 418 LanguageTag tag = new LanguageTag(); 419 420 String language = baseLocale.getLanguage(); 421 String script = baseLocale.getScript(); 422 String region = baseLocale.getRegion(); 423 String variant = baseLocale.getVariant(); 424 425 boolean hasSubtag = false; 426 427 String privuseVar = null; // store ill-formed variant subtags 428 429 if (isLanguage(language)) { 430 // Convert a deprecated language code to its new code 431 if (language.equals("iw")) { 432 language = "he"; 433 } else if (language.equals("ji")) { 434 language = "yi"; 435 } else if (language.equals("in")) { 436 language = "id"; 437 } 438 tag.language = language; 439 } 440 441 if (isScript(script)) { 442 tag.script = canonicalizeScript(script); 443 hasSubtag = true; 444 } 445 446 if (isRegion(region)) { 447 tag.region = canonicalizeRegion(region); 448 hasSubtag = true; 449 } 450 451 // Special handling for no_NO_NY - use nn_NO for language tag 452 if (tag.language.equals("no") && tag.region.equals("NO") && variant.equals("NY")) { 453 tag.language = "nn"; 454 variant = ""; 455 } 456 457 if (!variant.isEmpty()) { 458 List<String> variants = null; 459 StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP); 460 while (!varitr.isDone()) { 461 String var = varitr.current(); 462 if (!isVariant(var)) { 463 break; 464 } 465 if (variants == null) { 466 variants = new ArrayList<>(); 467 } 468 variants.add(var); // Do not canonicalize! 469 varitr.next(); 470 } 471 if (variants != null) { 472 tag.variants = variants; 473 hasSubtag = true; 474 } 475 if (!varitr.isDone()) { 476 // ill-formed variant subtags 477 StringJoiner sj = new StringJoiner(SEP); 478 while (!varitr.isDone()) { 479 String prvv = varitr.current(); 480 if (!isPrivateuseSubtag(prvv)) { 481 // cannot use private use subtag - truncated 482 break; 483 } 484 sj.add(prvv); 485 varitr.next(); 486 } 487 if (sj.length() > 0) { 488 privuseVar = sj.toString(); 489 } 490 } 491 } 492 493 List<String> extensions = null; 494 String privateuse = null; 495 496 if (localeExtensions != null) { 497 Set<Character> locextKeys = localeExtensions.getKeys(); 498 for (Character locextKey : locextKeys) { 499 Extension ext = localeExtensions.getExtension(locextKey); 500 if (isPrivateusePrefixChar(locextKey)) { 501 privateuse = ext.getValue(); 502 } else { 503 if (extensions == null) { 504 extensions = new ArrayList<>(); 505 } 506 extensions.add(locextKey.toString() + SEP + ext.getValue()); 507 } 508 } 509 } 510 511 if (extensions != null) { 512 tag.extensions = extensions; 513 hasSubtag = true; 514 } 515 516 // append ill-formed variant subtags to private use 517 if (privuseVar != null) { 518 if (privateuse == null) { 519 privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar; 520 } else { 521 privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX 522 + SEP + privuseVar.replace(BaseLocale.SEP, SEP); 523 } 524 } 525 526 if (privateuse != null) { 527 tag.privateuse = privateuse; 528 } 529 530 if (tag.language.isEmpty() && (hasSubtag || privateuse == null)) { 531 // use lang "und" when 1) no language is available AND 532 // 2) any of other subtags other than private use are available or 533 // no private use tag is available 534 tag.language = UNDETERMINED; 535 } 536 537 return tag; 538 } 539 540 // 541 // Getter methods for language subtag fields 542 // 543 getLanguage()544 public String getLanguage() { 545 return language; 546 } 547 getExtlangs()548 public List<String> getExtlangs() { 549 if (extlangs.isEmpty()) { 550 return Collections.emptyList(); 551 } 552 return Collections.unmodifiableList(extlangs); 553 } 554 getScript()555 public String getScript() { 556 return script; 557 } 558 getRegion()559 public String getRegion() { 560 return region; 561 } 562 getVariants()563 public List<String> getVariants() { 564 if (variants.isEmpty()) { 565 return Collections.emptyList(); 566 } 567 return Collections.unmodifiableList(variants); 568 } 569 getExtensions()570 public List<String> getExtensions() { 571 if (extensions.isEmpty()) { 572 return Collections.emptyList(); 573 } 574 return Collections.unmodifiableList(extensions); 575 } 576 getPrivateuse()577 public String getPrivateuse() { 578 return privateuse; 579 } 580 581 // 582 // Language subtag syntax checking methods 583 // 584 isLanguage(String s)585 public static boolean isLanguage(String s) { 586 // language = 2*3ALPHA ; shortest ISO 639 code 587 // ["-" extlang] ; sometimes followed by 588 // ; extended language subtags 589 // / 4ALPHA ; or reserved for future use 590 // / 5*8ALPHA ; or registered language subtag 591 int len = s.length(); 592 return (len >= 2) && (len <= 8) && LocaleUtils.isAlphaString(s); 593 } 594 isExtlang(String s)595 public static boolean isExtlang(String s) { 596 // extlang = 3ALPHA ; selected ISO 639 codes 597 // *2("-" 3ALPHA) ; permanently reserved 598 return (s.length() == 3) && LocaleUtils.isAlphaString(s); 599 } 600 isScript(String s)601 public static boolean isScript(String s) { 602 // script = 4ALPHA ; ISO 15924 code 603 return (s.length() == 4) && LocaleUtils.isAlphaString(s); 604 } 605 isRegion(String s)606 public static boolean isRegion(String s) { 607 // region = 2ALPHA ; ISO 3166-1 code 608 // / 3DIGIT ; UN M.49 code 609 return ((s.length() == 2) && LocaleUtils.isAlphaString(s)) 610 || ((s.length() == 3) && LocaleUtils.isNumericString(s)); 611 } 612 isVariant(String s)613 public static boolean isVariant(String s) { 614 // variant = 5*8alphanum ; registered variants 615 // / (DIGIT 3alphanum) 616 int len = s.length(); 617 if (len >= 5 && len <= 8) { 618 return LocaleUtils.isAlphaNumericString(s); 619 } 620 if (len == 4) { 621 return LocaleUtils.isNumeric(s.charAt(0)) 622 && LocaleUtils.isAlphaNumeric(s.charAt(1)) 623 && LocaleUtils.isAlphaNumeric(s.charAt(2)) 624 && LocaleUtils.isAlphaNumeric(s.charAt(3)); 625 } 626 return false; 627 } 628 isExtensionSingleton(String s)629 public static boolean isExtensionSingleton(String s) { 630 // singleton = DIGIT ; 0 - 9 631 // / %x41-57 ; A - W 632 // / %x59-5A ; Y - Z 633 // / %x61-77 ; a - w 634 // / %x79-7A ; y - z 635 636 return (s.length() == 1) 637 && LocaleUtils.isAlphaString(s) 638 && !LocaleUtils.caseIgnoreMatch(PRIVATEUSE, s); 639 } 640 isExtensionSingletonChar(char c)641 public static boolean isExtensionSingletonChar(char c) { 642 return isExtensionSingleton(String.valueOf(c)); 643 } 644 isExtensionSubtag(String s)645 public static boolean isExtensionSubtag(String s) { 646 // extension = singleton 1*("-" (2*8alphanum)) 647 int len = s.length(); 648 return (len >= 2) && (len <= 8) && LocaleUtils.isAlphaNumericString(s); 649 } 650 isPrivateusePrefix(String s)651 public static boolean isPrivateusePrefix(String s) { 652 // privateuse = "x" 1*("-" (1*8alphanum)) 653 return (s.length() == 1) 654 && LocaleUtils.caseIgnoreMatch(PRIVATEUSE, s); 655 } 656 isPrivateusePrefixChar(char c)657 public static boolean isPrivateusePrefixChar(char c) { 658 return (LocaleUtils.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c))); 659 } 660 isPrivateuseSubtag(String s)661 public static boolean isPrivateuseSubtag(String s) { 662 // privateuse = "x" 1*("-" (1*8alphanum)) 663 int len = s.length(); 664 return (len >= 1) && (len <= 8) && LocaleUtils.isAlphaNumericString(s); 665 } 666 667 // 668 // Language subtag canonicalization methods 669 // 670 canonicalizeLanguage(String s)671 public static String canonicalizeLanguage(String s) { 672 return LocaleUtils.toLowerString(s); 673 } 674 canonicalizeExtlang(String s)675 public static String canonicalizeExtlang(String s) { 676 return LocaleUtils.toLowerString(s); 677 } 678 canonicalizeScript(String s)679 public static String canonicalizeScript(String s) { 680 return LocaleUtils.toTitleString(s); 681 } 682 canonicalizeRegion(String s)683 public static String canonicalizeRegion(String s) { 684 return LocaleUtils.toUpperString(s); 685 } 686 canonicalizeVariant(String s)687 public static String canonicalizeVariant(String s) { 688 return LocaleUtils.toLowerString(s); 689 } 690 canonicalizeExtension(String s)691 public static String canonicalizeExtension(String s) { 692 return LocaleUtils.toLowerString(s); 693 } 694 canonicalizeExtensionSingleton(String s)695 public static String canonicalizeExtensionSingleton(String s) { 696 return LocaleUtils.toLowerString(s); 697 } 698 canonicalizeExtensionSubtag(String s)699 public static String canonicalizeExtensionSubtag(String s) { 700 return LocaleUtils.toLowerString(s); 701 } 702 canonicalizePrivateuse(String s)703 public static String canonicalizePrivateuse(String s) { 704 return LocaleUtils.toLowerString(s); 705 } 706 canonicalizePrivateuseSubtag(String s)707 public static String canonicalizePrivateuseSubtag(String s) { 708 return LocaleUtils.toLowerString(s); 709 } 710 711 @Override toString()712 public String toString() { 713 StringBuilder sb = new StringBuilder(); 714 715 if (!language.isEmpty()) { 716 sb.append(language); 717 718 for (String extlang : extlangs) { 719 sb.append(SEP).append(extlang); 720 } 721 722 if (!script.isEmpty()) { 723 sb.append(SEP).append(script); 724 } 725 726 if (!region.isEmpty()) { 727 sb.append(SEP).append(region); 728 } 729 730 for (String variant : variants) { 731 sb.append(SEP).append(variant); 732 } 733 734 for (String extension : extensions) { 735 sb.append(SEP).append(extension); 736 } 737 } 738 if (!privateuse.isEmpty()) { 739 if (sb.length() > 0) { 740 sb.append(SEP); 741 } 742 sb.append(privateuse); 743 } 744 745 return sb.toString(); 746 } 747 } 748