1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4 ******************************************************************************* 5 * Copyright (C) 2015-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9package com.ibm.icu.impl.locale; 10 11import java.util.Arrays; 12import java.util.EnumSet; 13import java.util.HashSet; 14import java.util.Set; 15import java.util.regex.Pattern; 16 17import com.ibm.icu.impl.ValidIdentifiers; 18import com.ibm.icu.impl.ValidIdentifiers.Datasubtype; 19import com.ibm.icu.impl.ValidIdentifiers.Datatype; 20import com.ibm.icu.impl.locale.KeyTypeData.ValueType; 21import com.ibm.icu.util.IllformedLocaleException; 22import com.ibm.icu.util.Output; 23import com.ibm.icu.util.ULocale; 24 25/** 26 * @author markdavis 27 * 28 */ 29public class LocaleValidityChecker { 30 private final Set<Datasubtype> datasubtypes; 31 private final boolean allowsDeprecated; 32 public static class Where { 33 public Datatype fieldFailure; 34 public String codeFailure; 35 36 public boolean set(Datatype datatype, String code) { 37 fieldFailure = datatype; 38 codeFailure = code; 39 return false; 40 } 41 @Override 42 public String toString() { 43 return fieldFailure == null ? "OK" : "{" + fieldFailure + ", " + codeFailure + "}"; 44 } 45 } 46 47 public LocaleValidityChecker(Set<Datasubtype> datasubtypes) { 48 this.datasubtypes = EnumSet.copyOf(datasubtypes); 49 allowsDeprecated = datasubtypes.contains(Datasubtype.deprecated); 50 } 51 52 public LocaleValidityChecker(Datasubtype... datasubtypes) { 53 this.datasubtypes = EnumSet.copyOf(Arrays.asList(datasubtypes)); 54 allowsDeprecated = this.datasubtypes.contains(Datasubtype.deprecated); 55 } 56 57 /** 58 * @return the datasubtypes 59 */ 60 public Set<Datasubtype> getDatasubtypes() { 61 return EnumSet.copyOf(datasubtypes); 62 } 63 64 static Pattern SEPARATOR = Pattern.compile("[-_]"); 65 66 @SuppressWarnings("unused") 67 private static final Pattern VALID_X = Pattern.compile("[a-zA-Z0-9]{2,8}(-[a-zA-Z0-9]{2,8})*"); 68 69 public boolean isValid(ULocale locale, Where where) { 70 where.set(null, null); 71 final String language = locale.getLanguage(); 72 final String script = locale.getScript(); 73 final String region = locale.getCountry(); 74 final String variantString = locale.getVariant(); 75 final Set<Character> extensionKeys = locale.getExtensionKeys(); 76 // if (language.isEmpty()) { 77 // // the only case where this is valid is if there is only an 'x' extension string 78 // if (!script.isEmpty() || !region.isEmpty() || variantString.isEmpty() 79 // || extensionKeys.size() != 1 || !extensionKeys.contains('x')) { 80 // return where.set(Datatype.x, "Null language only with x-..."); 81 // } 82 // return true; // for x string, wellformedness = valid 83 // } 84 if (!isValid(Datatype.language, language, where)) { 85 // special case x 86 if (language.equals("x")) { 87 where.set(null, null); // for x, well-formed == valid 88 return true; 89 } 90 return false; 91 } 92 if (!isValid(Datatype.script, script, where)) return false; 93 if (!isValid(Datatype.region, region, where)) return false; 94 if (!variantString.isEmpty()) { 95 for (String variant : SEPARATOR.split(variantString)) { 96 if (!isValid(Datatype.variant, variant, where)) return false; 97 } 98 } 99 for (Character c : extensionKeys) { 100 try { 101 Datatype datatype = Datatype.valueOf(c+""); 102 switch (datatype) { 103 case x: 104 return true; // if it is syntactic (checked by ULocale) it is valid 105 case t: 106 case u: 107 if (!isValidU(locale, datatype, locale.getExtension(c), where)) return false; 108 break; 109 } 110 } catch (Exception e) { 111 return where.set(Datatype.illegal, c+""); 112 } 113 } 114 return true; 115 } 116 117 // TODO combine this with the KeyTypeData.SpecialType, and get it from the type, not the key 118 enum SpecialCase { 119 normal, anything, reorder, codepoints, subdivision, rgKey; 120 static SpecialCase get(String key) { 121 if (key.equals("kr")) { 122 return reorder; 123 } else if (key.equals("vt")) { 124 return codepoints; 125 } else if (key.equals("sd")) { 126 return subdivision; 127 } else if (key.equals("rg")) { 128 return rgKey; 129 } else if (key.equals("x0")) { 130 return anything; 131 } else { 132 return normal; 133 } 134 } 135 } 136 137 /** 138 * @param locale 139 * @param datatype 140 * @param extension 141 * @param where 142 * @return 143 */ 144 private boolean isValidU(ULocale locale, Datatype datatype, String extensionString, Where where) { 145 String key = ""; 146 int typeCount = 0; 147 ValueType valueType = null; 148 SpecialCase specialCase = null; 149 StringBuilder prefix = new StringBuilder(); 150 Set<String> seen = new HashSet<String>(); 151 152 StringBuilder tBuffer = datatype == Datatype.t ? new StringBuilder() : null; 153 154 // TODO: is empty -u- valid? 155 156 for (String subtag : SEPARATOR.split(extensionString)) { 157 if (subtag.length() == 2 158 && (tBuffer == null || subtag.charAt(1) <= '9')) { 159 // if we have accumulated a t buffer, check that first 160 if (tBuffer != null) { 161 // Check t buffer. Empty after 't' is ok. 162 if (tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(),where)) { 163 return false; 164 } 165 tBuffer = null; 166 } 167 key = KeyTypeData.toBcpKey(subtag); 168 if (key == null) { 169 return where.set(datatype, subtag); 170 } 171 if (!allowsDeprecated && KeyTypeData.isDeprecated(key)) { 172 return where.set(datatype, key); 173 } 174 valueType = KeyTypeData.getValueType(key); 175 specialCase = SpecialCase.get(key); 176 typeCount = 0; 177 } else if (tBuffer != null) { 178 if (tBuffer.length() != 0) { 179 tBuffer.append('-'); 180 } 181 tBuffer.append(subtag); 182 } else { 183 ++typeCount; 184 switch (valueType) { 185 case single: 186 if (typeCount > 1) { 187 return where.set(datatype, key+"-"+subtag); 188 } 189 break; 190 case incremental: 191 if (typeCount == 1) { 192 prefix.setLength(0); 193 prefix.append(subtag); 194 } else { 195 prefix.append('-').append(subtag); 196 subtag = prefix.toString(); 197 } 198 break; 199 case multiple: 200 if (typeCount == 1) { 201 seen.clear(); 202 } 203 break; 204 } 205 switch (specialCase) { 206 case anything: 207 continue; 208 case codepoints: 209 try { 210 if (Integer.parseInt(subtag,16) > 0x10FFFF) { 211 return where.set(datatype, key+"-"+subtag); 212 } 213 } catch (NumberFormatException e) { 214 return where.set(datatype, key+"-"+subtag); 215 } 216 continue; 217 case reorder: 218 boolean newlyAdded = seen.add(subtag.equals("zzzz") ? "others" : subtag); 219 if (!newlyAdded || !isScriptReorder(subtag)) { 220 return where.set(datatype, key+"-"+subtag); 221 } 222 continue; 223 case subdivision: 224 if (!isSubdivision(locale, subtag)) { 225 return where.set(datatype, key+"-"+subtag); 226 } 227 continue; 228 case rgKey: 229 if (subtag.length() < 6 || !subtag.endsWith("zzzz")) { 230 return where.set(datatype, subtag); 231 } 232 if (!isValid(Datatype.region, subtag.substring(0,subtag.length()-4), where)) { 233 return false; 234 } 235 continue; 236 } 237 238 // en-u-sd-usca 239 // en-US-u-sd-usca 240 Output<Boolean> isKnownKey = new Output<Boolean>(); 241 Output<Boolean> isSpecialType = new Output<Boolean>(); 242 String type = KeyTypeData.toBcpType(key, subtag, isKnownKey, isSpecialType); 243 if (type == null) { 244 return where.set(datatype, key+"-"+subtag); 245 } 246 if (!allowsDeprecated && KeyTypeData.isDeprecated(key, subtag)) { 247 return where.set(datatype, key+"-"+subtag); 248 } 249 } 250 } 251 // Check t buffer. Empty after 't' is ok. 252 if (tBuffer != null && tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(),where)) { 253 return false; 254 } 255 return true; 256 } 257 258 /** 259 * @param locale 260 * @param subtag 261 * @return 262 */ 263 private boolean isSubdivision(ULocale locale, String subtag) { 264 // First check if the subtag is valid 265 if (subtag.length() < 3) { 266 return false; 267 } 268 String region = subtag.substring(0, subtag.charAt(0) <= '9' ? 3 : 2); 269 String subdivision = subtag.substring(region.length()); 270 if (ValidIdentifiers.isValid(Datatype.subdivision, datasubtypes, region, subdivision) == null) { 271 return false; 272 } 273 // Then check for consistency with the locale's region 274 String localeRegion = locale.getCountry(); 275 if (localeRegion.isEmpty()) { 276 ULocale max = ULocale.addLikelySubtags(locale); 277 localeRegion = max.getCountry(); 278 } 279 if (!region.equalsIgnoreCase(localeRegion)) { 280 return false; 281 } 282 return true; 283 } 284 285 static final Set<String> REORDERING_INCLUDE = new HashSet<String>(Arrays.asList("space", "punct", "symbol", "currency", "digit", "others", "zzzz")); 286 static final Set<String> REORDERING_EXCLUDE = new HashSet<String>(Arrays.asList("zinh", "zyyy")); 287 static final Set<Datasubtype> REGULAR_ONLY = EnumSet.of(Datasubtype.regular); 288 /** 289 * @param subtag 290 * @return 291 */ 292 private boolean isScriptReorder(String subtag) { 293 subtag = AsciiUtil.toLowerString(subtag); 294 if (REORDERING_INCLUDE.contains(subtag)) { 295 return true; 296 } else if (REORDERING_EXCLUDE.contains(subtag)) { 297 return false; 298 } 299 return ValidIdentifiers.isValid(Datatype.script, REGULAR_ONLY, subtag) != null; 300 // space, punct, symbol, currency, digit - core groups of characters below 'a' 301 // any script code except Common and Inherited. 302 // sc ; Zinh ; Inherited ; Qaai 303 // sc ; Zyyy ; Common 304 // Some pairs of scripts sort primary-equal and always reorder together. For example, Katakana characters are are always reordered with Hiragana. 305 // others - where all codes not explicitly mentioned should be ordered. The script code Zzzz (Unknown Script) is a synonym for others. return false; 306 } 307 308 /** 309 * @param extensionString 310 * @param where 311 * @return 312 */ 313 private boolean isValidLocale(String extensionString, Where where) { 314 try { 315 ULocale locale = new ULocale.Builder().setLanguageTag(extensionString).build(); 316 return isValid(locale, where); 317 } catch (IllformedLocaleException e) { 318 int startIndex = e.getErrorIndex(); 319 String[] list = SEPARATOR.split(extensionString.substring(startIndex)); 320 return where.set(Datatype.t, list[0]); 321 } catch (Exception e) { 322 return where.set(Datatype.t, e.getMessage()); 323 } 324 } 325 326 /** 327 * @param language 328 * @param language2 329 * @return 330 */ 331 private boolean isValid(Datatype datatype, String code, Where where) { 332 return code.isEmpty() ? true : 333 ValidIdentifiers.isValid(datatype, datasubtypes, code) != null ? true : 334 where == null ? false 335 : where.set(datatype, code); 336 } 337} 338