1/* GENERATED SOURCE. DO NOT MODIFY. */
2/*
3 *******************************************************************************
4 * Copyright (C) 2009-2015, Google, International Business Machines Corporation
5 * and others. All Rights Reserved.
6 *******************************************************************************
7 */
8package android.icu.impl;
9
10import java.io.BufferedReader;
11import java.io.FileInputStream;
12import java.io.IOException;
13import java.io.InputStream;
14import java.io.InputStreamReader;
15import java.io.UnsupportedEncodingException;
16import java.text.ParsePosition;
17import java.util.Arrays;
18import java.util.Comparator;
19import java.util.LinkedHashSet;
20import java.util.List;
21import java.util.Map;
22import java.util.Map.Entry;
23import java.util.Set;
24import java.util.TreeMap;
25import java.util.regex.Pattern;
26
27import android.icu.text.StringTransform;
28import android.icu.text.SymbolTable;
29import android.icu.text.UnicodeSet;
30import android.icu.util.Freezable;
31
32/**
33 * Contains utilities to supplement the JDK Regex, since it doesn't handle
34 * Unicode well.
35 *
36 * <p>TODO: Move to android.icu.dev.somewhere.
37 * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
38 *
39 * @author markdavis
40 * @hide Only a subset of ICU is exposed in Android
41 */
42public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
43    // Note: we don't currently have any state, but intend to in the future,
44    // particularly for the regex style supported.
45
46    private SymbolTable symbolTable;
47
48    /**
49     * Set the symbol table for internal processing
50     * @hide draft / provisional / internal are hidden on Android
51     */
52    public SymbolTable getSymbolTable() {
53        return symbolTable;
54    }
55
56    /**
57     * Get the symbol table for internal processing
58     * @hide draft / provisional / internal are hidden on Android
59     */
60    public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
61        this.symbolTable = symbolTable;
62        return this;
63    }
64
65    /**
66     * Adds full Unicode property support, with the latest version of Unicode,
67     * to Java Regex, bringing it up to Level 1 (see
68     * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
69     * regex pattern string and interpreting the character classes (\p{...},
70     * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
71     * this utility, Java regex expressions can be updated to work with the
72     * latest version of Unicode, and with all Unicode properties. Note that the
73     * UnicodeSet syntax has not yet, however, been updated to be completely
74     * consistent with Java regex, so be careful of the differences.
75     * <p>Not thread-safe; create a separate copy for different threads.
76     * <p>In the future, we may extend this to support other regex packages.
77     *
78     * @regex A modified Java regex pattern, as in the input to
79     *        Pattern.compile(), except that all "character classes" are
80     *        processed as if they were UnicodeSet patterns. Example:
81     *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
82     * @return A processed Java regex pattern, suitable for input to
83     *         Pattern.compile().
84     */
85    public String transform(String regex) {
86        StringBuilder result = new StringBuilder();
87        UnicodeSet temp = new UnicodeSet();
88        ParsePosition pos = new ParsePosition(0);
89        int state = 0; // 1 = after \
90
91        // We add each character unmodified to the output, unless we have a
92        // UnicodeSet. Note that we don't worry about supplementary characters,
93        // since none of the syntax uses them.
94
95        for (int i = 0; i < regex.length(); ++i) {
96            // look for UnicodeSets, allowing for quoting with \ and \Q
97            char ch = regex.charAt(i);
98            switch (state) {
99            case 0: // we only care about \, and '['.
100                if (ch == '\\') {
101                    if (UnicodeSet.resemblesPattern(regex, i)) {
102                        // should only happen with \p
103                        i = processSet(regex, i, result, temp, pos);
104                        continue;
105                    }
106                    state = 1;
107                } else if (ch == '[') {
108                    // if we have what looks like a UnicodeSet
109                    if (UnicodeSet.resemblesPattern(regex, i)) {
110                        i = processSet(regex, i, result, temp, pos);
111                        continue;
112                    }
113                }
114                break;
115
116            case 1: // we are after a \
117                if (ch == 'Q') {
118                    state = 1;
119                } else {
120                    state = 0;
121                }
122                break;
123
124            case 2: // we are in a \Q...
125                if (ch == '\\') {
126                    state = 3;
127                }
128                break;
129
130            case 3: // we are in at \Q...\
131                if (ch == 'E') {
132                    state = 0;
133                }
134                state = 2;
135                break;
136            }
137            result.append(ch);
138        }
139        return result.toString();
140    }
141
142    /**
143     * Convenience static function, using standard parameters.
144     * @param regex as in process()
145     * @return processed regex pattern, as in process()
146     */
147    public static String fix(String regex) {
148        return STANDARD.transform(regex);
149    }
150
151    /**
152     * Compile a regex string, after processing by fix(...).
153     *
154     * @param regex Raw regex pattern, as in fix(...).
155     * @return Pattern
156     */
157    public static Pattern compile(String regex) {
158        return Pattern.compile(STANDARD.transform(regex));
159    }
160
161    /**
162     * Compile a regex string, after processing by fix(...).
163     *
164     * @param regex Raw regex pattern, as in fix(...).
165     * @return Pattern
166     */
167    public static Pattern compile(String regex, int options) {
168        return Pattern.compile(STANDARD.transform(regex), options);
169    }
170
171    /**
172     * Compile a composed string from a set of BNF lines; see the List version for more information.
173     *
174     * @param bnfLines Series of BNF lines.
175     * @return Pattern
176     */
177    public String compileBnf(String bnfLines) {
178        return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
179    }
180
181    /**
182     * Compile a composed string from a set of BNF lines, such as for composing a regex
183     * expression. The lines can be in any order, but there must not be any
184     * cycles. The result can be used as input for fix().
185     * <p>
186     * Example:
187     * <pre>
188     * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
189     * scheme = reserved+;
190     * host = // reserved+;
191     * query = [\\=reserved]+;
192     * fragment = reserved+;
193     * reserved = [[:ascii:][:alphabetic:]];
194     * </pre>
195     * <p>
196     * Caveats: at this point the parsing is simple; for example, # cannot be
197     * quoted (use \\u0023); you can set it to null to disable.
198     * The equality sign and a few others can be reset with
199     * setBnfX().
200     *
201     * @param lines Series of lines that represent a BNF expression. The lines contain
202     *          a series of statements that of the form x=y;. A statement can take
203     *          multiple lines, but there can't be multiple statements on a line.
204     *          A hash quotes to the end of the line.
205     * @return Pattern
206     */
207    public String compileBnf(List<String> lines) {
208        Map<String, String> variables = getVariables(lines);
209        Set<String> unused = new LinkedHashSet<String>(variables.keySet());
210        // brute force replacement; do twice to allow for different order
211        // later on can optimize
212        for (int i = 0; i < 2; ++i) {
213            for (Entry<String, String> entry : variables.entrySet()) {
214                String variable   = entry.getKey(),
215                       definition = entry.getValue();
216
217                for (Entry<String, String> entry2 : variables.entrySet()) {
218                    String variable2 = entry2.getKey(),
219                           definition2 = entry2.getValue();
220                    if (variable.equals(variable2)) {
221                        continue;
222                    }
223                    String altered2 = definition2.replace(variable, definition);
224                    if (!altered2.equals(definition2)) {
225                        unused.remove(variable);
226                        variables.put(variable2, altered2);
227                        if (log != null) {
228                            try {
229                                log.append(variable2 + "=" + altered2 + ";");
230                            } catch (IOException e) {
231                                throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
232                            }
233                        }
234                    }
235                }
236            }
237        }
238        if (unused.size() != 1) {
239            throw new IllegalArgumentException("Not a single root: " + unused);
240        }
241        return variables.get(unused.iterator().next());
242    }
243
244    public String getBnfCommentString() {
245        return bnfCommentString;
246    }
247
248    public void setBnfCommentString(String bnfCommentString) {
249        this.bnfCommentString = bnfCommentString;
250    }
251
252    public String getBnfVariableInfix() {
253        return bnfVariableInfix;
254    }
255
256    public void setBnfVariableInfix(String bnfVariableInfix) {
257        this.bnfVariableInfix = bnfVariableInfix;
258    }
259
260    public String getBnfLineSeparator() {
261        return bnfLineSeparator;
262    }
263
264    public void setBnfLineSeparator(String bnfLineSeparator) {
265        this.bnfLineSeparator = bnfLineSeparator;
266    }
267
268    /**
269     * Utility for loading lines from a file.
270     * @param result The result of the appended lines.
271     * @param file The file to have an input stream.
272     * @param encoding if null, then UTF-8
273     * @return filled list
274     * @throws IOException If there were problems opening the file for input stream.
275     */
276    public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
277        InputStream is = new FileInputStream(file);
278        try {
279            return appendLines(result, is, encoding);
280        } finally {
281            is.close();
282        }
283    }
284
285    /**
286     * Utility for loading lines from a UTF8 file.
287     * @param result The result of the appended lines.
288     * @param inputStream The input stream.
289     * @param encoding if null, then UTF-8
290     * @return filled list
291     * @throws IOException  If there were problems opening the input stream for reading.
292     */
293    public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
294            throws UnsupportedEncodingException, IOException {
295        BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
296        while (true) {
297            String line = in.readLine();
298            if (line == null) break;
299            result.add(line);
300        }
301        return result;
302    }
303
304
305
306    /* (non-Javadoc)
307     * @see android.icu.util.Freezable#cloneAsThawed()
308     */
309    public UnicodeRegex cloneAsThawed() {
310        // TODO Auto-generated method stub
311        try {
312            return (UnicodeRegex)clone();
313        } catch (CloneNotSupportedException e) {
314            throw new IllegalArgumentException(); // should never happen
315        }
316    }
317
318    /* (non-Javadoc)
319     * @see android.icu.util.Freezable#freeze()
320     */
321    public UnicodeRegex freeze() {
322        // no action needed now.
323        return this;
324    }
325
326    /* (non-Javadoc)
327     * @see android.icu.util.Freezable#isFrozen()
328     */
329    public boolean isFrozen() {
330        // at this point, always true
331        return true;
332    }
333
334    // ===== PRIVATES =====
335
336    private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
337        try {
338            pos.setIndex(i);
339            UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
340            x.complement().complement(); // hack to fix toPattern
341            result.append(x.toPattern(false));
342            i = pos.getIndex() - 1; // allow for the loop increment
343            return i;
344        } catch (Exception e) {
345            throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
346        }
347    }
348
349    private static UnicodeRegex STANDARD = new UnicodeRegex();
350    private String bnfCommentString = "#";
351    private String bnfVariableInfix = "=";
352    private String bnfLineSeparator = "\n";
353    private Appendable log = null;
354
355    private Comparator<Object> LongestFirst = new Comparator<Object>() {
356        public int compare(Object obj0, Object obj1) {
357            String arg0 = obj0.toString();
358            String arg1 = obj1.toString();
359            int len0 = arg0.length();
360            int len1 = arg1.length();
361            if (len0 != len1) return len1 - len0;
362            return arg0.compareTo(arg1);
363        }
364    };
365
366    private Map<String, String> getVariables(List<String> lines) {
367        Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
368        String variable = null;
369        StringBuffer definition = new StringBuffer();
370        int count = 0;
371        for (String line : lines) {
372            ++count;
373            // remove initial bom, comments
374            if (line.length() == 0) continue;
375            if (line.charAt(0) == '\uFEFF') line = line.substring(1);
376
377            if (bnfCommentString != null) {
378                int hashPos = line.indexOf(bnfCommentString);
379                if (hashPos >= 0) line = line.substring(0, hashPos);
380            }
381            String trimline = line.trim();
382            if (trimline.length() == 0) continue;
383
384            // String[] lineParts = line.split(";");
385            String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
386            if (linePart.trim().length() == 0) continue;
387            boolean terminated = trimline.endsWith(";");
388            if (terminated) {
389                linePart = linePart.substring(0,linePart.lastIndexOf(';'));
390            }
391            int equalsPos = linePart.indexOf(bnfVariableInfix);
392            if (equalsPos >= 0) {
393                if (variable != null) {
394                    throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
395                }
396                variable = linePart.substring(0,equalsPos).trim();
397                if (variables.containsKey(variable)) {
398                    throw new IllegalArgumentException("Duplicate variable definition in " + line);
399                }
400                definition.append(linePart.substring(equalsPos+1).trim());
401            } else { // no equals, so
402                if (variable == null) {
403                    throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
404                }
405                definition.append(bnfLineSeparator).append(linePart);
406            }
407            // we are terminated if i is not at the end, or the line ends with a ;
408            if (terminated) {
409                variables.put(variable, definition.toString());
410                variable = null; // signal we have no variable
411                definition.setLength(0);
412            }
413        }
414        if (variable != null) {
415            throw new IllegalArgumentException("Missing ';' at end");
416        }
417        return variables;
418    }
419}
420