1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2015-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9package com.ibm.icu.impl.locale;
10
11import java.util.Arrays;
12import java.util.EnumSet;
13import java.util.HashSet;
14import java.util.Set;
15import java.util.regex.Pattern;
16
17import com.ibm.icu.impl.ValidIdentifiers;
18import com.ibm.icu.impl.ValidIdentifiers.Datasubtype;
19import com.ibm.icu.impl.ValidIdentifiers.Datatype;
20import com.ibm.icu.impl.locale.KeyTypeData.ValueType;
21import com.ibm.icu.util.IllformedLocaleException;
22import com.ibm.icu.util.Output;
23import com.ibm.icu.util.ULocale;
24
25/**
26 * @author markdavis
27 *
28 */
29public class LocaleValidityChecker {
30    private final Set<Datasubtype> datasubtypes;
31    private final boolean allowsDeprecated;
32    public static class Where {
33        public Datatype fieldFailure;
34        public String codeFailure;
35
36        public boolean set(Datatype datatype, String code) {
37            fieldFailure = datatype;
38            codeFailure = code;
39            return false;
40        }
41        @Override
42        public String toString() {
43            return fieldFailure == null ? "OK" : "{" + fieldFailure + ", " + codeFailure + "}";
44        }
45    }
46
47    public LocaleValidityChecker(Set<Datasubtype> datasubtypes) {
48        this.datasubtypes = EnumSet.copyOf(datasubtypes);
49        allowsDeprecated = datasubtypes.contains(Datasubtype.deprecated);
50    }
51
52    public LocaleValidityChecker(Datasubtype... datasubtypes) {
53        this.datasubtypes = EnumSet.copyOf(Arrays.asList(datasubtypes));
54        allowsDeprecated = this.datasubtypes.contains(Datasubtype.deprecated);
55    }
56
57    /**
58     * @return the datasubtypes
59     */
60    public Set<Datasubtype> getDatasubtypes() {
61        return EnumSet.copyOf(datasubtypes);
62    }
63
64    static Pattern SEPARATOR = Pattern.compile("[-_]");
65
66    @SuppressWarnings("unused")
67    private static final Pattern VALID_X = Pattern.compile("[a-zA-Z0-9]{2,8}(-[a-zA-Z0-9]{2,8})*");
68
69    public boolean isValid(ULocale locale, Where where) {
70        where.set(null, null);
71        final String language = locale.getLanguage();
72        final String script = locale.getScript();
73        final String region = locale.getCountry();
74        final String variantString = locale.getVariant();
75        final Set<Character> extensionKeys = locale.getExtensionKeys();
76        //        if (language.isEmpty()) {
77        //            // the only case where this is valid is if there is only an 'x' extension string
78        //            if (!script.isEmpty() || !region.isEmpty() || variantString.isEmpty()
79        //                    || extensionKeys.size() != 1 || !extensionKeys.contains('x')) {
80        //                return where.set(Datatype.x, "Null language only with x-...");
81        //            }
82        //            return true; // for x string, wellformedness = valid
83        //        }
84        if (!isValid(Datatype.language, language, where)) {
85            // special case x
86            if (language.equals("x")) {
87                where.set(null, null); // for x, well-formed == valid
88                return true;
89            }
90            return false;
91        }
92        if (!isValid(Datatype.script, script, where)) return false;
93        if (!isValid(Datatype.region, region, where)) return false;
94        if (!variantString.isEmpty()) {
95            for (String variant : SEPARATOR.split(variantString)) {
96                if (!isValid(Datatype.variant, variant, where)) return false;
97            }
98        }
99        for (Character c : extensionKeys) {
100            try {
101                Datatype datatype = Datatype.valueOf(c+"");
102                switch (datatype) {
103                case x:
104                    return true; // if it is syntactic (checked by ULocale) it is valid
105                case t:
106                case u:
107                    if (!isValidU(locale, datatype, locale.getExtension(c), where)) return false;
108                    break;
109                }
110            } catch (Exception e) {
111                return where.set(Datatype.illegal, c+"");
112            }
113        }
114        return true;
115    }
116
117    // TODO combine this with the KeyTypeData.SpecialType, and get it from the type, not the key
118    enum SpecialCase {
119        normal, anything, reorder, codepoints, subdivision, rgKey;
120        static SpecialCase get(String key) {
121            if (key.equals("kr")) {
122                return reorder;
123            } else if (key.equals("vt")) {
124                return codepoints;
125            } else if (key.equals("sd")) {
126                return subdivision;
127            } else if (key.equals("rg")) {
128                return rgKey;
129            } else if (key.equals("x0")) {
130                return anything;
131            } else {
132                return normal;
133            }
134        }
135    }
136
137    /**
138     * @param locale
139     * @param datatype
140     * @param extension
141     * @param where
142     * @return
143     */
144    private boolean isValidU(ULocale locale, Datatype datatype, String extensionString, Where where) {
145        String key = "";
146        int typeCount = 0;
147        ValueType valueType = null;
148        SpecialCase specialCase = null;
149        StringBuilder prefix = new StringBuilder();
150        Set<String> seen = new HashSet<String>();
151
152        StringBuilder tBuffer = datatype == Datatype.t ? new StringBuilder() : null;
153
154        // TODO: is empty -u- valid?
155
156        for (String subtag : SEPARATOR.split(extensionString)) {
157            if (subtag.length() == 2
158                    && (tBuffer == null || subtag.charAt(1) <= '9')) {
159                // if we have accumulated a t buffer, check that first
160                if (tBuffer != null) {
161                    // Check t buffer. Empty after 't' is ok.
162                    if (tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(),where)) {
163                        return false;
164                    }
165                    tBuffer = null;
166                }
167                key = KeyTypeData.toBcpKey(subtag);
168                if (key == null) {
169                    return where.set(datatype, subtag);
170                }
171                if (!allowsDeprecated && KeyTypeData.isDeprecated(key)) {
172                    return where.set(datatype, key);
173                }
174                valueType = KeyTypeData.getValueType(key);
175                specialCase = SpecialCase.get(key);
176                typeCount = 0;
177            } else if (tBuffer != null) {
178                if (tBuffer.length() != 0) {
179                    tBuffer.append('-');
180                }
181                tBuffer.append(subtag);
182            } else {
183                ++typeCount;
184                switch (valueType) {
185                case single:
186                    if (typeCount > 1) {
187                        return where.set(datatype, key+"-"+subtag);
188                    }
189                    break;
190                case incremental:
191                    if (typeCount == 1) {
192                        prefix.setLength(0);
193                        prefix.append(subtag);
194                    } else {
195                        prefix.append('-').append(subtag);
196                        subtag = prefix.toString();
197                    }
198                    break;
199                case multiple:
200                    if (typeCount == 1) {
201                        seen.clear();
202                    }
203                    break;
204                }
205                switch (specialCase) {
206                case anything:
207                    continue;
208                case codepoints:
209                    try {
210                        if (Integer.parseInt(subtag,16) > 0x10FFFF) {
211                            return where.set(datatype, key+"-"+subtag);
212                        }
213                    } catch (NumberFormatException e) {
214                        return where.set(datatype, key+"-"+subtag);
215                    }
216                    continue;
217                case reorder:
218                    boolean newlyAdded = seen.add(subtag.equals("zzzz") ? "others" : subtag);
219                    if (!newlyAdded || !isScriptReorder(subtag)) {
220                        return where.set(datatype, key+"-"+subtag);
221                    }
222                    continue;
223                case subdivision:
224                    if (!isSubdivision(locale, subtag)) {
225                        return where.set(datatype, key+"-"+subtag);
226                    }
227                    continue;
228                case rgKey:
229                    if (subtag.length() < 6 || !subtag.endsWith("zzzz")) {
230                        return where.set(datatype, subtag);
231                    }
232                    if (!isValid(Datatype.region, subtag.substring(0,subtag.length()-4), where)) {
233                        return false;
234                    }
235                    continue;
236                }
237
238                // en-u-sd-usca
239                // en-US-u-sd-usca
240                Output<Boolean> isKnownKey = new Output<Boolean>();
241                Output<Boolean> isSpecialType = new Output<Boolean>();
242                String type = KeyTypeData.toBcpType(key, subtag, isKnownKey, isSpecialType);
243                if (type == null) {
244                    return where.set(datatype, key+"-"+subtag);
245                }
246                if (!allowsDeprecated && KeyTypeData.isDeprecated(key, subtag)) {
247                    return where.set(datatype, key+"-"+subtag);
248                }
249            }
250        }
251        // Check t buffer. Empty after 't' is ok.
252        if (tBuffer != null && tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(),where)) {
253            return false;
254        }
255        return true;
256    }
257
258    /**
259     * @param locale
260     * @param subtag
261     * @return
262     */
263    private boolean isSubdivision(ULocale locale, String subtag) {
264        // First check if the subtag is valid
265        if (subtag.length() < 3) {
266            return false;
267        }
268        String region = subtag.substring(0, subtag.charAt(0) <= '9' ? 3 : 2);
269        String subdivision = subtag.substring(region.length());
270        if (ValidIdentifiers.isValid(Datatype.subdivision, datasubtypes, region, subdivision) == null) {
271            return false;
272        }
273        // Then check for consistency with the locale's region
274        String localeRegion = locale.getCountry();
275        if (localeRegion.isEmpty()) {
276            ULocale max = ULocale.addLikelySubtags(locale);
277            localeRegion = max.getCountry();
278        }
279        if (!region.equalsIgnoreCase(localeRegion)) {
280            return false;
281        }
282        return true;
283    }
284
285    static final Set<String> REORDERING_INCLUDE = new HashSet<String>(Arrays.asList("space", "punct", "symbol", "currency", "digit", "others", "zzzz"));
286    static final Set<String> REORDERING_EXCLUDE = new HashSet<String>(Arrays.asList("zinh", "zyyy"));
287    static final Set<Datasubtype> REGULAR_ONLY = EnumSet.of(Datasubtype.regular);
288    /**
289     * @param subtag
290     * @return
291     */
292    private boolean isScriptReorder(String subtag) {
293        subtag = AsciiUtil.toLowerString(subtag);
294        if (REORDERING_INCLUDE.contains(subtag)) {
295            return true;
296        } else if (REORDERING_EXCLUDE.contains(subtag)) {
297            return false;
298        }
299        return ValidIdentifiers.isValid(Datatype.script, REGULAR_ONLY, subtag) != null;
300        //        space, punct, symbol, currency, digit - core groups of characters below 'a'
301        //        any script code except Common and Inherited.
302        //      sc ; Zinh                             ; Inherited                        ; Qaai
303        //      sc ; Zyyy                             ; Common
304        //        Some pairs of scripts sort primary-equal and always reorder together. For example, Katakana characters are are always reordered with Hiragana.
305        //        others - where all codes not explicitly mentioned should be ordered. The script code Zzzz (Unknown Script) is a synonym for others.        return false;
306    }
307
308    /**
309     * @param extensionString
310     * @param where
311     * @return
312     */
313    private boolean isValidLocale(String extensionString, Where where) {
314        try {
315            ULocale locale = new ULocale.Builder().setLanguageTag(extensionString).build();
316            return isValid(locale, where);
317        } catch (IllformedLocaleException e) {
318            int startIndex = e.getErrorIndex();
319            String[] list = SEPARATOR.split(extensionString.substring(startIndex));
320            return where.set(Datatype.t, list[0]);
321        } catch (Exception e) {
322            return where.set(Datatype.t, e.getMessage());
323        }
324    }
325
326    /**
327     * @param language
328     * @param language2
329     * @return
330     */
331    private boolean isValid(Datatype datatype, String code, Where where) {
332        return code.isEmpty() ? true :
333            ValidIdentifiers.isValid(datatype, datasubtypes, code) != null ? true :
334                where == null ? false
335                        : where.set(datatype, code);
336    }
337}
338