// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2010-2013, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.impl.locale;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Holder for the subtags of a language tag as described by RFC 5646
 * (language, extlangs, script, region, variants, extensions, private use).
 *
 * <p>Instances are only created through {@link #parse(String, ParseStatus)} (from tag text)
 * or {@link #parseLocale(BaseLocale, LocaleExtensions)} (from locale fields). The class also
 * exposes static syntax checks ({@code isXxx}) and case canonicalization helpers
 * ({@code canonicalizeXxx}) for individual subtags.
 */
public class LanguageTag {
    // Compile-time switch between the ICU behavior (false) and the JDK behavior (true)
    // in parseLocale.
    private static final boolean JDKIMPL = false;

    //
    // static fields
    //
    public static final String SEP = "-";
    public static final String PRIVATEUSE = "x";
    // Language subtag used by parseLocale when no language is available.
    public static final String UNDETERMINED = "und";
    public static final String PRIVUSE_VARIANT_PREFIX = "lvariant";

    //
    // Language subtag fields
    //
    private String _language = "";      // language subtag
    private String _script = "";        // script subtag
    private String _region = "";        // region subtag
    private String _privateuse = "";    // privateuse

    private List<String> _extlangs = Collections.emptyList();   // extlang subtags
    private List<String> _variants = Collections.emptyList();   // variant subtags
    private List<String> _extensions = Collections.emptyList(); // extensions

    // Map contains grandfathered tags and their preferred mappings from
    // http://www.ietf.org/rfc/rfc5646.txt
    // Each value is the {tag, preferred} pair; the key is the tag, matched case-insensitively.
    private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED =
        new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>();

    static {
        // grandfathered = irregular           ; non-redundant tags registered
        //               / regular             ; during the RFC 3066 era
        //
        // irregular     = "en-GB-oed"         ; irregular tags do not match
        //               / "i-ami"             ; the 'langtag' production and
        //               / "i-bnn"             ; would not otherwise be
        //               / "i-default"         ; considered 'well-formed'
        //               / "i-enochian"        ; These tags are all valid,
        //               / "i-hak"             ; but most are deprecated
        //               / "i-klingon"         ; in favor of more modern
        //               / "i-lux"             ; subtags or subtag
        //               / "i-mingo"           ; combination
        //               / "i-navajo"
        //               / "i-pwn"
        //               / "i-tao"
        //               / "i-tay"
        //               / "i-tsu"
        //               / "sgn-BE-FR"
        //               / "sgn-BE-NL"
        //               / "sgn-CH-DE"
        //
        // regular       = "art-lojban"        ; these tags match the 'langtag'
        //               / "cel-gaulish"       ; production, but their subtags
        //               / "no-bok"            ; are not extended language
        //               / "no-nyn"            ; or variant subtags: their meaning
        //               / "zh-guoyu"          ; is defined by their registration
        //               / "zh-hakka"          ; and all of these are deprecated
        //               / "zh-min"            ; in favor of a more modern
        //               / "zh-min-nan"        ; subtag or sequence of subtags
        //               / "zh-xiang"

        final String[][] entries = {
            //{"tag",         "preferred"},
            {"art-lojban",    "jbo"},
            {"cel-gaulish",   "xtg-x-cel-gaulish"},   // fallback
            {"en-GB-oed",     "en-GB-x-oed"},         // fallback
            {"i-ami",         "ami"},
            {"i-bnn",         "bnn"},
            {"i-default",     "en-x-i-default"},      // fallback
            {"i-enochian",    "und-x-i-enochian"},    // fallback
            {"i-hak",         "hak"},
            {"i-klingon",     "tlh"},
            {"i-lux",         "lb"},
            {"i-mingo",       "see-x-i-mingo"},       // fallback
            {"i-navajo",      "nv"},
            {"i-pwn",         "pwn"},
            {"i-tao",         "tao"},
            {"i-tay",         "tay"},
            {"i-tsu",         "tsu"},
            {"no-bok",        "nb"},
            {"no-nyn",        "nn"},
            {"sgn-BE-FR",     "sfb"},
            {"sgn-BE-NL",     "vgt"},
            {"sgn-CH-DE",     "sgg"},
            {"zh-guoyu",      "cmn"},
            {"zh-hakka",      "hak"},
            {"zh-min",        "nan-x-zh-min"},        // fallback
            {"zh-min-nan",    "nan"},
            {"zh-xiang",      "hsn"},
        };
        for (String[] e : entries) {
            GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
        }
    }

    // Instances are produced only by the static parse/parseLocale factories.
    private LanguageTag() {
    }

    /*
     * BNF in RFC 5646
     *
     * Language-Tag  = langtag             ; normal language tags
     *               / privateuse          ; private use tag
     *               / grandfathered       ; grandfathered tags
     *
     *
     * langtag       = language
     *                 ["-" script]
     *                 ["-" region]
     *                 *("-" variant)
     *                 *("-" extension)
     *                 ["-" privateuse]
     *
     * language      = 2*3ALPHA            ; shortest ISO 639 code
     *                 ["-" extlang]       ; sometimes followed by
     *                                     ; extended language subtags
     *               / 4ALPHA              ; or reserved for future use
     *               / 5*8ALPHA            ; or registered language subtag
     *
     * extlang       = 3ALPHA              ; selected ISO 639 codes
     *                 *2("-" 3ALPHA)      ; permanently reserved
     *
     * script        = 4ALPHA              ; ISO 15924 code
     *
     * region        = 2ALPHA              ; ISO 3166-1 code
     *               / 3DIGIT              ; UN M.49 code
     *
     * variant       = 5*8alphanum         ; registered variants
     *               / (DIGIT 3alphanum)
     *
     * extension     = singleton 1*("-" (2*8alphanum))
     *
     *                                     ; Single alphanumerics
     *                                     ; "x" reserved for private use
     * singleton     = DIGIT               ; 0 - 9
     *               / %x41-57             ; A - W
     *               / %x59-5A             ; Y - Z
     *               / %x61-77             ; a - w
     *               / %x79-7A             ; y - z
     *
     * privateuse    = "x" 1*("-" (1*8alphanum))
     *
     */

    /**
     * Parses language tag text into its subtags, following the BNF above.
     *
     * <p>If the text matches a grandfathered tag (case-insensitively), its preferred mapping
     * is parsed instead and the reported parse length is the length of the original text.
     * Otherwise, if parsing stops before the end of the text and no error has been recorded
     * yet, the offending subtag is reported through {@code sts}. Subtags are stored exactly
     * as they appear in the input; no case canonicalization is applied here.
     *
     * @param languageTag the language tag text
     * @param sts receives parse length and error details; it is reset on entry. When
     *            {@code null}, a local status object is used and its contents are discarded.
     * @return a tag holding every subtag that was successfully parsed (never {@code null})
     */
    public static LanguageTag parse(String languageTag, ParseStatus sts) {
        if (sts == null) {
            sts = new ParseStatus();
        } else {
            sts.reset();
        }

        StringTokenIterator itr;
        boolean isGrandfathered = false;

        // Check if the tag is grandfathered
        String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
        if (gfmap != null) {
            // use preferred mapping
            itr = new StringTokenIterator(gfmap[1], SEP);
            isGrandfathered = true;
        } else {
            itr = new StringTokenIterator(languageTag, SEP);
        }

        LanguageTag tag = new LanguageTag();

        // langtag must start with either language or privateuse
        if (tag.parseLanguage(itr, sts)) {
            tag.parseExtlangs(itr, sts);
            tag.parseScript(itr, sts);
            tag.parseRegion(itr, sts);
            tag.parseVariants(itr, sts);
            tag.parseExtensions(itr, sts);
        }
        tag.parsePrivateuse(itr, sts);

        if (isGrandfathered) {
            // Grandfathered tag is replaced with a well-formed tag above.
            // However, the parsed length must be the original tag length.
            assert (itr.isDone());
            assert (!sts.isError());
            sts._parseLength = languageTag.length();
        } else if (!itr.isDone() && !sts.isError()) {
            // Leftover input that no sub-parser accepted: report the first such subtag.
            String s = itr.current();
            sts._errorIndex = itr.currentStart();
            if (s.length() == 0) {
                sts._errorMsg = "Empty subtag";
            } else {
                sts._errorMsg = "Invalid subtag: " + s;
            }
        }

        return tag;
    }

    //
    // Language subtag parsers
    //
    // Each parser returns immediately with false when the iterator is exhausted or an error
    // is already recorded. On each accepted subtag it stores itr.currentEnd() in
    // sts._parseLength and advances the iterator. The return value tells whether at least
    // one subtag (or complete extension / privateuse sequence) was consumed.
    //

    /** Consumes the current token as the language subtag if it passes {@link #isLanguage}. */
    private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        String s = itr.current();
        if (isLanguage(s)) {
            found = true;
            _language = s;
            sts._parseLength = itr.currentEnd();
            itr.next();
        }

        return found;
    }

    /** Consumes up to three consecutive tokens that pass {@link #isExtlang}. */
    private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        while (!itr.isDone()) {
            String s = itr.current();
            if (!isExtlang(s)) {
                break;
            }
            found = true;
            if (_extlangs.isEmpty()) {
                // Replace the shared immutable empty list with a mutable one on first hit.
                _extlangs = new ArrayList<String>(3);
            }
            _extlangs.add(s);
            sts._parseLength = itr.currentEnd();
            itr.next();

            if (_extlangs.size() == 3) {
                // Maximum 3 extlangs
                break;
            }
        }

        return found;
    }

    /** Consumes the current token as the script subtag if it passes {@link #isScript}. */
    private boolean parseScript(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        String s = itr.current();
        if (isScript(s)) {
            found = true;
            _script = s;
            sts._parseLength = itr.currentEnd();
            itr.next();
        }

        return found;
    }

    /** Consumes the current token as the region subtag if it passes {@link #isRegion}. */
    private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        String s = itr.current();
        if (isRegion(s)) {
            found = true;
            _region = s;
            sts._parseLength = itr.currentEnd();
            itr.next();
        }

        return found;
    }

    /** Consumes every consecutive token that passes {@link #isVariant}. */
    private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        while (!itr.isDone()) {
            String s = itr.current();
            if (!isVariant(s)) {
                break;
            }
            found = true;
            if (_variants.isEmpty()) {
                _variants = new ArrayList<String>(3);
            }
            _variants.add(s);
            sts._parseLength = itr.currentEnd();
            itr.next();
        }

        return found;
    }

    /**
     * Consumes consecutive extensions, each a singleton followed by one or more extension
     * subtags, storing each as {@code singleton-subtag[-subtag...]}.
     *
     * <p>A singleton with no following extension subtag records an
     * "Incomplete extension" error in {@code sts} and stops parsing.
     */
    private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        while (!itr.isDone()) {
            String s = itr.current();
            if (isExtensionSingleton(s)) {
                int start = itr.currentStart();
                String singleton = s;
                StringBuilder sb = new StringBuilder(singleton);

                itr.next();
                while (!itr.isDone()) {
                    s = itr.current();
                    if (isExtensionSubtag(s)) {
                        sb.append(SEP).append(s);
                        sts._parseLength = itr.currentEnd();
                    } else {
                        break;
                    }
                    itr.next();
                }

                // _parseLength is only advanced by an accepted extension subtag, so if it
                // still lies at or before the singleton, nothing followed the singleton.
                if (sts._parseLength <= start) {
                    sts._errorIndex = start;
                    sts._errorMsg = "Incomplete extension '" + singleton + "'";
                    break;
                }

                if (_extensions.isEmpty()) {
                    _extensions = new ArrayList<String>(4);
                }
                _extensions.add(sb.toString());
                found = true;
            } else {
                break;
            }
        }
        return found;
    }

    /**
     * Consumes a private use sequence: the prefix {@code "x"} followed by one or more
     * private use subtags, stored as a single string including the prefix.
     *
     * <p>A prefix with no following subtag records an "Incomplete privateuse" error.
     */
    private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) {
        if (itr.isDone() || sts.isError()) {
            return false;
        }

        boolean found = false;

        String s = itr.current();
        if (isPrivateusePrefix(s)) {
            int start = itr.currentStart();
            StringBuilder sb = new StringBuilder(s);

            itr.next();
            while (!itr.isDone()) {
                s = itr.current();
                if (!isPrivateuseSubtag(s)) {
                    break;
                }
                sb.append(SEP).append(s);
                sts._parseLength = itr.currentEnd();

                itr.next();
            }

            if (sts._parseLength <= start) {
                // need at least 1 private subtag
                sts._errorIndex = start;
                sts._errorMsg = "Incomplete privateuse";
            } else {
                _privateuse = sb.toString();
                found = true;
            }
        }

        return found;
    }

    /**
     * Builds a language tag from locale fields and locale extensions.
     *
     * <p>Language, script and region are used only when non-empty and syntactically valid;
     * script and region are case-canonicalized. The deprecated language codes
     * {@code iw}, {@code ji} and {@code in} are replaced with {@code he}, {@code yi} and
     * {@code id}. Leading well-formed variant subtags become variants; the first ill-formed
     * variant subtag and the ones after it are moved into the private use section behind
     * {@link #PRIVUSE_VARIANT_PREFIX}, stopping at the first subtag that is not a valid
     * private use subtag. When no language is available, {@link #UNDETERMINED} is used
     * unless the result consists of private use only.
     *
     * @param baseLocale source of language, script, region and variant
     * @param localeExtensions source of extensions and private use
     * @return the constructed tag (never {@code null})
     */
    public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) {
        LanguageTag tag = new LanguageTag();

        String language = baseLocale.getLanguage();
        String script = baseLocale.getScript();
        String region = baseLocale.getRegion();
        String variant = baseLocale.getVariant();

        // True once any subtag other than language / private use has been set.
        boolean hasSubtag = false;

        String privuseVar = null;   // store ill-formed variant subtags

        if (language.length() > 0 && isLanguage(language)) {
            // Convert a deprecated language code used by Java to
            // a new code
            if (language.equals("iw")) {
                language = "he";
            } else if (language.equals("ji")) {
                language = "yi";
            } else if (language.equals("in")) {
                language = "id";
            }
            tag._language = language;
        }

        if (script.length() > 0 && isScript(script)) {
            tag._script = canonicalizeScript(script);
            hasSubtag = true;
        }

        if (region.length() > 0 && isRegion(region)) {
            tag._region = canonicalizeRegion(region);
            hasSubtag = true;
        }

        if (JDKIMPL) {
            // Special handling for no_NO_NY - use nn_NO for language tag
            if (tag._language.equals("no") && tag._region.equals("NO") && variant.equals("NY")) {
                tag._language = "nn";
                variant = "";
            }
        }

        if (variant.length() > 0) {
            List<String> variants = null;
            StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP);
            while (!varitr.isDone()) {
                String var = varitr.current();
                if (!isVariant(var)) {
                    break;
                }
                if (variants == null) {
                    variants = new ArrayList<String>();
                }
                if (JDKIMPL) {
                    variants.add(var);  // Do not canonicalize!
                } else {
                    variants.add(canonicalizeVariant(var));
                }
                varitr.next();
            }
            if (variants != null) {
                tag._variants = variants;
                hasSubtag = true;
            }
            if (!varitr.isDone()) {
                // ill-formed variant subtags
                StringBuilder buf = new StringBuilder();
                while (!varitr.isDone()) {
                    String prvv = varitr.current();
                    if (!isPrivateuseSubtag(prvv)) {
                        // cannot use private use subtag - truncated
                        break;
                    }
                    if (buf.length() > 0) {
                        buf.append(SEP);
                    }
                    if (!JDKIMPL) {
                        prvv = AsciiUtil.toLowerString(prvv);
                    }
                    buf.append(prvv);
                    varitr.next();
                }
                if (buf.length() > 0) {
                    privuseVar = buf.toString();
                }
            }
        }

        List<String> extensions = null;
        String privateuse = null;

        Set<Character> locextKeys = localeExtensions.getKeys();
        for (Character locextKey : locextKeys) {
            Extension ext = localeExtensions.getExtension(locextKey);
            if (isPrivateusePrefixChar(locextKey.charValue())) {
                privateuse = ext.getValue();
            } else {
                if (extensions == null) {
                    extensions = new ArrayList<String>();
                }
                extensions.add(locextKey.toString() + SEP + ext.getValue());
            }
        }

        if (extensions != null) {
            tag._extensions = extensions;
            hasSubtag = true;
        }

        // append ill-formed variant subtags to private use
        if (privuseVar != null) {
            if (privateuse == null) {
                privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar;
            } else {
                privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX + SEP + privuseVar.replace(BaseLocale.SEP, SEP);
            }
        }

        if (privateuse != null) {
            tag._privateuse = privateuse;
        }

        if (tag._language.length() == 0 && (hasSubtag || privateuse == null)) {
            // use lang "und" when 1) no language is available AND
            // 2) any of other subtags other than private use are available or
            //    no private use tag is available
            tag._language = UNDETERMINED;
        }

        return tag;
    }

    //
    // Getter methods for language subtag fields
    //

    /** Returns the language subtag, or the empty string if none was set. */
    public String getLanguage() {
        return _language;
    }

    /** Returns an unmodifiable view of the extlang subtags (possibly empty). */
    public List<String> getExtlangs() {
        return Collections.unmodifiableList(_extlangs);
    }

    /** Returns the script subtag, or the empty string if none was set. */
    public String getScript() {
        return _script;
    }

    /** Returns the region subtag, or the empty string if none was set. */
    public String getRegion() {
        return _region;
    }

    /** Returns an unmodifiable view of the variant subtags (possibly empty). */
    public List<String> getVariants() {
        return Collections.unmodifiableList(_variants);
    }

    /** Returns an unmodifiable view of the extensions (possibly empty). */
    public List<String> getExtensions() {
        return Collections.unmodifiableList(_extensions);
    }

    /** Returns the private use section, or the empty string if none was set. */
    public String getPrivateuse() {
        return _privateuse;
    }

    //
    // Language subtag syntax checking methods
    //

    /** Returns whether {@code s} is 2 to 8 ASCII letters. */
    public static boolean isLanguage(String s) {
        // language      = 2*3ALPHA            ; shortest ISO 639 code
        //                 ["-" extlang]       ; sometimes followed by
        //                                     ;   extended language subtags
        //               / 4ALPHA              ; or reserved for future use
        //               / 5*8ALPHA            ; or registered language subtag
        return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaString(s);
    }

    /** Returns whether {@code s} is exactly 3 ASCII letters. */
    public static boolean isExtlang(String s) {
        // extlang       = 3ALPHA              ; selected ISO 639 codes
        //                 *2("-" 3ALPHA)      ; permanently reserved
        return (s.length() == 3) && AsciiUtil.isAlphaString(s);
    }

    /** Returns whether {@code s} is exactly 4 ASCII letters. */
    public static boolean isScript(String s) {
        // script        = 4ALPHA              ; ISO 15924 code
        return (s.length() == 4) && AsciiUtil.isAlphaString(s);
    }

    /** Returns whether {@code s} is 2 ASCII letters or 3 ASCII digits. */
    public static boolean isRegion(String s) {
        // region        = 2ALPHA              ; ISO 3166-1 code
        //               / 3DIGIT              ; UN M.49 code
        return ((s.length() == 2) && AsciiUtil.isAlphaString(s))
                || ((s.length() == 3) && AsciiUtil.isNumericString(s));
    }

    /**
     * Returns whether {@code s} is 5 to 8 alphanumerics, or a digit followed by
     * 3 alphanumerics.
     */
    public static boolean isVariant(String s) {
        // variant       = 5*8alphanum         ; registered variants
        //               / (DIGIT 3alphanum)
        int len = s.length();
        if (len >= 5 && len <= 8) {
            return AsciiUtil.isAlphaNumericString(s);
        }
        if (len == 4) {
            return AsciiUtil.isNumeric(s.charAt(0))
                    && AsciiUtil.isAlphaNumeric(s.charAt(1))
                    && AsciiUtil.isAlphaNumeric(s.charAt(2))
                    && AsciiUtil.isAlphaNumeric(s.charAt(3));
        }
        return false;
    }

    /**
     * Returns whether {@code s} is an extension singleton: a single ASCII digit or letter
     * other than the private use prefix {@code x}/{@code X}.
     */
    public static boolean isExtensionSingleton(String s) {
        // singleton     = DIGIT               ; 0 - 9
        //               / %x41-57             ; A - W
        //               / %x59-5A             ; Y - Z
        //               / %x61-77             ; a - w
        //               / %x79-7A             ; y - z

        // Alphanumeric (not alpha-only) so that the DIGIT alternative above is accepted.
        return (s.length() == 1)
                && AsciiUtil.isAlphaNumericString(s)
                && !AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s);
    }

    /** Character form of {@link #isExtensionSingleton(String)}. */
    public static boolean isExtensionSingletonChar(char c) {
        return isExtensionSingleton(String.valueOf(c));
    }

    /** Returns whether {@code s} is 2 to 8 alphanumerics. */
    public static boolean isExtensionSubtag(String s) {
        // extension     = singleton 1*("-" (2*8alphanum))
        return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s);
    }

    /** Returns whether {@code s} is the private use prefix {@code x}, ignoring case. */
    public static boolean isPrivateusePrefix(String s) {
        // privateuse    = "x" 1*("-" (1*8alphanum))
        return (s.length() == 1)
                && AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s);
    }

    /** Character form of {@link #isPrivateusePrefix(String)}. */
    public static boolean isPrivateusePrefixChar(char c) {
        return (AsciiUtil.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c)));
    }

    /** Returns whether {@code s} is 1 to 8 alphanumerics. */
    public static boolean isPrivateuseSubtag(String s) {
        // privateuse    = "x" 1*("-" (1*8alphanum))
        return (s.length() >= 1) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s);
    }

    //
    // Language subtag canonicalization methods
    //
    // Each method delegates to a single AsciiUtil case conversion: title case for script,
    // upper case for region, lower case for everything else.
    //

    public static String canonicalizeLanguage(String s) {
        return AsciiUtil.toLowerString(s);
    }

    public static String canonicalizeExtlang(String s) {
        return AsciiUtil.toLowerString(s);
    }

    public static String canonicalizeScript(String s) {
        return AsciiUtil.toTitleString(s);
    }

    public static String canonicalizeRegion(String s) {
        return AsciiUtil.toUpperString(s);
    }

    public static String canonicalizeVariant(String s) {
        return AsciiUtil.toLowerString(s);
    }

    public static String canonicalizeExtension(String s) {
        return AsciiUtil.toLowerString(s);
    }

    public static String canonicalizeExtensionSingleton(String s) {
        return AsciiUtil.toLowerString(s);
    }

    public static String canonicalizeExtensionSubtag(String s) {
        return AsciiUtil.toLowerString(s);
    }

    public static String canonicalizePrivateuse(String s) {
        return AsciiUtil.toLowerString(s);
    }

    public static String canonicalizePrivateuseSubtag(String s) {
        return AsciiUtil.toLowerString(s);
    }

    /**
     * Joins the stored subtags with {@link #SEP} in the order language, extlangs, script,
     * region, variants, extensions, private use. Everything except private use is emitted
     * only when a language subtag is present.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();

        if (_language.length() > 0) {
            sb.append(_language);

            for (String extlang : _extlangs) {
                sb.append(SEP).append(extlang);
            }

            if (_script.length() > 0) {
                sb.append(SEP).append(_script);
            }

            if (_region.length() > 0) {
                sb.append(SEP).append(_region);
            }

            for (String variant : _variants) {
                sb.append(SEP).append(variant);
            }

            for (String extension : _extensions) {
                sb.append(SEP).append(extension);
            }
        }
        if (_privateuse.length() > 0) {
            if (sb.length() > 0) {
                sb.append(SEP);
            }
            sb.append(_privateuse);
        }

        return sb.toString();
    }
}